@@ -56,6 +56,8 @@ const (
 	WorkerLabel = "node-role.kubernetes.io/worker"
 	// ControlPlaneLabel defines the label associated with master/control-plane node.
 	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
+	// ArbiterLabel defines the label associated with arbiter node.
+	ArbiterLabel = "node-role.kubernetes.io/arbiter"
 
 	// maxRetries is the number of times a machineconfig pool will be retried before it is dropped out of the queue.
 	// With the current rate-limiter in use (5ms*2^(maxRetries-1)) the following numbers represent the times
@@ -1199,6 +1201,8 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
 	var arbiterMosc *mcfgv1.MachineOSConfig
 	var arbiterMosb *mcfgv1.MachineOSBuild
 	var arbiterLayered bool
+	// In HighlyAvailableArbiterMode, fold the arbiter nodes into this list so the annotation, taint, and update logic below runs once over both pools.
+	var combinedNodes = append([]*corev1.Node{}, nodes...)
 	if pool.Name == ctrlcommon.MachineConfigPoolMaster && controlPlaneTopology == configv1.HighlyAvailableArbiterMode {
 		arbiterObj, err := ctrl.mcpLister.Get(ctrlcommon.MachineConfigPoolArbiter)
 		if err != nil {
@@ -1214,28 +1218,21 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
 			if err != nil {
 				return fmt.Errorf("error getting config and build for arbiter pool %q, error: %w", ctrlcommon.MachineConfigPoolArbiter, err)
 			}
-			combinedNodes := append([]*corev1.Node{}, nodes...)
 			combinedNodes = append(combinedNodes, arbiterNodes...)
 			combinedMax, err := maxUnavailable(pool, combinedNodes)
 			if err != nil {
 				return fmt.Errorf("error getting max unavailable count for pool %q, error: %w", pool.Name, err)
 			}
-			arbiterUnavailable := len(getUnavailableMachines(arbiterNodes, arbiterPool))
-			// Adjust maxunavail to account for arbiter unavailable nodes
-			// This ensures we don't exceed the combined maxUnavailable across both pools
-			maxunavail = combinedMax - arbiterUnavailable
-			if maxunavail < 0 {
-				maxunavail = 0
-			}
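+			// Use the combined budget directly here; the arbiter update below consumes whatever capacity the master update leaves unused.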
+			maxunavail = combinedMax
 		}
 	}
 
-	if err := ctrl.setClusterConfigAnnotation(nodes, controlPlaneTopology); err != nil {
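+	// Outside arbiter mode combinedNodes is identical to nodes, so a single pass covers both cases here and in the taint loop below.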
+	if err := ctrl.setClusterConfigAnnotation(combinedNodes, controlPlaneTopology); err != nil {
 		return fmt.Errorf("error setting clusterConfig Annotation for node in pool %q, error: %w", pool.Name, err)
 	}
 	// Taint all the nodes in the node pool, irrespective of their upgrade status.
 	ctx := context.TODO()
-	for _, node := range nodes {
+	for _, node := range combinedNodes {
 		// All the nodes that need to be upgraded should have `NodeUpdateInProgressTaint` so that they're less likely
 		// to be chosen during the scheduling cycle. This includes nodes which are:
 		// (i) In a Pool being updated to a new MC or image
@@ -1244,7 +1241,16 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
 
 		lns := ctrlcommon.NewLayeredNodeState(node)
 
-		if (!layered && lns.IsDesiredMachineConfigEqualToPool(pool) && !lns.AreImageAnnotationsPresentOnNode()) || (layered && lns.IsDesiredEqualToBuild(mosc, mosb)) {
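+		// An arbiter-labeled node is compared against the arbiter pool's desired config/build rather than the master pool's.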
+		desiredPool := pool
+		desiredMosc := mosc
+		desiredMosb := mosb
+		if _, ok := node.GetLabels()[ArbiterLabel]; ok {
+			desiredPool = arbiterPool
+			desiredMosc = arbiterMosc
+			desiredMosb = arbiterMosb
+		}
+
+		if (!layered && lns.IsDesiredMachineConfigEqualToPool(desiredPool) && !lns.AreImageAnnotationsPresentOnNode()) || (layered && lns.IsDesiredEqualToBuild(desiredMosc, desiredMosb)) {
 			if hasInProgressTaint {
 				if err := ctrl.removeUpdateInProgressTaint(ctx, node.Name); err != nil {
 					err = fmt.Errorf("failed removing %s taint for node %s: %w", constants.NodeUpdateInProgressTaint.Key, node.Name, err)
@@ -1260,8 +1266,8 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
 			}
 		}
 	}
+
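+	// Candidates are selected from the pool's own nodes; in arbiter mode maxunavail is the combined master+arbiter budget, and the arbiter pool consumes the remainder below.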
 	candidates, capacity := getAllCandidateMachines(layered, mosc, mosb, pool, nodes, maxunavail)
-	masterTargeted := 0
 	if len(candidates) > 0 {
 		zones := make(map[string]bool)
 		for _, candidate := range candidates {
@@ -1278,77 +1284,36 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
 			}
 			return err
 		}
-		masterTargeted = len(candidates)
 		ctrlcommon.UpdateStateMetric(ctrlcommon.MCCSubControllerState, "machine-config-controller-node", "Sync Machine Config Pool", pool.Name)
 	}
 
 	// If coordinating with arbiter pool, also handle arbiter node updates
-	if arbiterPool != nil && len(arbiterNodes) > 0 {
-		// Set cluster config annotation for arbiter nodes
-		if err := ctrl.setClusterConfigAnnotation(arbiterNodes, controlPlaneTopology); err != nil {
-			return fmt.Errorf("error setting clusterConfig Annotation for node in pool %q, error: %w", arbiterPool.Name, err)
-		}
-
-		// Handle taints for arbiter nodes
-		for _, node := range arbiterNodes {
-			hasInProgressTaint := checkIfNodeHasInProgressTaint(node)
-			lns := ctrlcommon.NewLayeredNodeState(node)
-			if (!arbiterLayered && lns.IsDesiredMachineConfigEqualToPool(arbiterPool) && !lns.AreImageAnnotationsPresentOnNode()) || (arbiterLayered && lns.IsDesiredEqualToBuild(arbiterMosc, arbiterMosb)) {
-				if hasInProgressTaint {
-					if err := ctrl.removeUpdateInProgressTaint(ctx, node.Name); err != nil {
-						err = fmt.Errorf("failed removing %s taint for node %s: %w", constants.NodeUpdateInProgressTaint.Key, node.Name, err)
-						klog.Error(err)
-					}
-				}
-			} else {
-				if !hasInProgressTaint {
-					if err := ctrl.setUpdateInProgressTaint(ctx, node.Name); err != nil {
-						err = fmt.Errorf("failed applying %s taint for node %s: %w", constants.NodeUpdateInProgressTaint.Key, node.Name, err)
-						klog.Error(err)
-					}
-				}
-			}
-		}
-
-		// Calculate remaining capacity for arbiter after master updates
-		masterUnavailable := len(getUnavailableMachines(nodes, pool))
+	if arbiterPool != nil && len(arbiterNodes) > 0 && pool.Name == ctrlcommon.MachineConfigPoolMaster && controlPlaneTopology == configv1.HighlyAvailableArbiterMode {
 		arbiterUnavailable := len(getUnavailableMachines(arbiterNodes, arbiterPool))
-		combinedNodes := append([]*corev1.Node{}, nodes...)
-		combinedNodes = append(combinedNodes, arbiterNodes...)
-		combinedMax, err := maxUnavailable(pool, combinedNodes)
-		if err == nil {
-			remainingCapacity := combinedMax - masterUnavailable - masterTargeted - arbiterUnavailable
-			if remainingCapacity < 0 {
-				remainingCapacity = 0
-			}
-			arbiterMaxUnavail := arbiterUnavailable + remainingCapacity
-			if arbiterMaxUnavail < 0 {
-				arbiterMaxUnavail = 0
-			}
-
-			arbiterCandidates, arbiterCapacity := getAllCandidateMachines(arbiterLayered, arbiterMosc, arbiterMosb, arbiterPool, arbiterNodes, arbiterMaxUnavail)
-			if len(arbiterCandidates) > 0 {
-				zones := make(map[string]bool)
-				for _, candidate := range arbiterCandidates {
-					if zone, ok := candidate.Labels[zoneLabel]; ok {
-						zones[zone] = true
-					}
-				}
-				ctrl.logPool(arbiterPool, "%d candidate nodes in %d zones for update, capacity: %d", len(arbiterCandidates), len(zones), arbiterCapacity)
-				if err := ctrl.updateCandidateMachines(arbiterLayered, arbiterMosc, arbiterMosb, arbiterPool, arbiterCandidates, arbiterCapacity); err != nil {
-					if syncErr := ctrl.syncStatusOnly(arbiterPool); syncErr != nil {
-						errs := kubeErrs.NewAggregate([]error{syncErr, err})
-						return fmt.Errorf("error setting annotations for pool %q, sync error: %w", arbiterPool.Name, errs)
-					}
-					return err
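+		// Spend what remains of the combined budget on the arbiter pool; arbiterUnavailable is added back so the arbiter's already-unavailable nodes don't double-count against the leftover capacity.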
+		//#nosec G115 -- no overflow: capacity is a uint converted to an int before the arithmetic
+		arbiterMaxUnavail := int(capacity) - len(candidates) + arbiterUnavailable
+		arbiterCandidates, arbiterCapacity := getAllCandidateMachines(arbiterLayered, arbiterMosc, arbiterMosb, arbiterPool, arbiterNodes, arbiterMaxUnavail)
+		if len(arbiterCandidates) > 0 {
+			zones := make(map[string]bool)
+			for _, candidate := range arbiterCandidates {
+				if zone, ok := candidate.Labels[zoneLabel]; ok {
+					zones[zone] = true
 				}
-				ctrlcommon.UpdateStateMetric(ctrlcommon.MCCSubControllerState, "machine-config-controller-node", "Sync Machine Config Pool", arbiterPool.Name)
 			}
-
-			// Sync status for arbiter pool
-			if err := ctrl.syncStatusOnly(arbiterPool); err != nil {
+			ctrl.logPool(arbiterPool, "%d candidate nodes in %d zones for update, capacity: %d", len(arbiterCandidates), len(zones), arbiterCapacity)
+			if err := ctrl.updateCandidateMachines(arbiterLayered, arbiterMosc, arbiterMosb, arbiterPool, arbiterCandidates, arbiterCapacity); err != nil {
+				if syncErr := ctrl.syncStatusOnly(arbiterPool); syncErr != nil {
+					errs := kubeErrs.NewAggregate([]error{syncErr, err})
+					return fmt.Errorf("error setting annotations for pool %q, sync error: %w", arbiterPool.Name, errs)
+				}
 				return err
 			}
+			ctrlcommon.UpdateStateMetric(ctrlcommon.MCCSubControllerState, "machine-config-controller-node", "Sync Machine Config Pool", arbiterPool.Name)
+		}
+
+		// Sync status for arbiter pool
+		if err := ctrl.syncStatusOnly(arbiterPool); err != nil {
+			return err
 		}
 	}
 