Commit 7cfebea

Merge pull request #5045 from openshift-cherrypick-robot/cherry-pick-4841-to-release-4.19
[release-4.19] OCPBUGS-56121: Adapt MCC to use LayeredNodeState and remove LayeredPoolState
2 parents: cdf75d4 + 3221687

10 files changed: +601 −788 lines

pkg/controller/common/layered_node_state.go
Lines changed: 146 additions & 148 deletions (large diff not rendered)

pkg/controller/common/layered_node_state_test.go
Lines changed: 168 additions & 109 deletions (large diff not rendered)

pkg/controller/common/layered_pool_state.go
Lines changed: 0 additions & 84 deletions (file deleted)

pkg/controller/common/layered_pool_state_test.go
Lines changed: 0 additions & 86 deletions (file deleted)
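
The two deleted files removed the separate LayeredPoolState type; its checks now live as methods on LayeredNodeState, which is what the node_controller.go diff below switches to. A minimal sketch of what the method-based checks look like to a caller, using simplified stand-in types rather than the real *corev1.Node wrapper; the method names come from this commit, the struct fields and bodies here are illustrative only:

```go
package main

import (
	"errors"
	"fmt"
)

// LayeredNodeState here is a simplified stand-in: the real type lives in
// pkg/controller/common, wraps a *corev1.Node, and reads MCO annotations.
type LayeredNodeState struct {
	ready         bool
	currentConfig string
	desiredConfig string
}

// CheckNodeReady replaces the old free function checkNodeReady(node).
func (l *LayeredNodeState) CheckNodeReady() error {
	if !l.ready {
		return errors.New("node is reporting NotReady")
	}
	return nil
}

// IsNodeDone replaces the old isNodeDone(node, false): the node has finished
// updating when its current config matches its desired config.
func (l *LayeredNodeState) IsNodeDone() bool {
	return l.currentConfig == l.desiredConfig
}

func main() {
	lns := &LayeredNodeState{ready: true, currentConfig: "rendered-worker-1", desiredConfig: "rendered-worker-1"}
	fmt.Println(lns.CheckNodeReady(), lns.IsNodeDone()) // <nil> true
}
```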

pkg/controller/node/node_controller.go
Lines changed: 26 additions & 54 deletions
@@ -635,8 +635,10 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {
     }
 
     var changed bool
-    oldReadyErr := checkNodeReady(oldNode)
-    newReadyErr := checkNodeReady(curNode)
+    oldLNS := ctrlcommon.NewLayeredNodeState(oldNode)
+    curLNS := ctrlcommon.NewLayeredNodeState(curNode)
+    oldReadyErr := oldLNS.CheckNodeReady()
+    newReadyErr := curLNS.CheckNodeReady()
 
     oldReady := getErrorString(oldReadyErr)
     newReady := getErrorString(newReadyErr)
@@ -652,7 +654,7 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {
 
     // Specifically log when a node has completed an update so the MCC logs are a useful central aggregate of state changes
     if oldNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] != oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] &&
-        isNodeDone(curNode, false) {
+        curLNS.IsNodeDone() {
         ctrl.logPoolNode(pool, curNode, "Completed update to %s", curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey])
         changed = true
     } else {
@@ -1092,12 +1094,14 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
     ctx := context.TODO()
     for _, node := range nodes {
         // All the nodes that need to be upgraded should have `NodeUpdateInProgressTaint` so that they're less likely
-        // to be chosen during the scheduling cycle.
+        // to be chosen during the scheduling cycle. This includes nodes which are:
+        // (i) In a Pool being updated to a new MC or image
+        // (ii) In a Pool that is being opted out of layering
         hasInProgressTaint := checkIfNodeHasInProgressTaint(node)
 
         lns := ctrlcommon.NewLayeredNodeState(node)
 
-        if lns.IsDesiredEqualToPool(pool, layered) {
+        if (!layered && lns.IsDesiredMachineConfigEqualToPool(pool) && !lns.AreImageAnnotationsPresentOnNode()) || (layered && lns.IsDesiredEqualToBuild(mosc, mosb)) {
             if hasInProgressTaint {
                 if err := ctrl.removeUpdateInProgressTaint(ctx, node.Name); err != nil {
                     err = fmt.Errorf("failed removing %s taint for node %s: %w", constants.NodeUpdateInProgressTaint.Key, node.Name, err)
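
The rewritten condition above is what decides whether a node already matches its target and may therefore have the NodeUpdateInProgressTaint removed. Restated as a stand-alone helper, with minimal stand-in types so the snippet compiles on its own; the method set mirrors the LayeredNodeState calls in the hunk, and the helper name nodeIsSettled is hypothetical:

```go
package main

type machineConfigPool struct{}
type machineOSConfig struct{}
type machineOSBuild struct{}

// nodeState lists only the LayeredNodeState methods the condition uses.
type nodeState interface {
	IsDesiredMachineConfigEqualToPool(pool *machineConfigPool) bool
	AreImageAnnotationsPresentOnNode() bool
	IsDesiredEqualToBuild(mosc *machineOSConfig, mosb *machineOSBuild) bool
}

// nodeIsSettled mirrors the if-condition above: a settled node keeps no
// in-progress taint.
func nodeIsSettled(layered bool, lns nodeState, pool *machineConfigPool, mosc *machineOSConfig, mosb *machineOSBuild) bool {
	if layered {
		// Layered pool: the node must already target the current build.
		return lns.IsDesiredEqualToBuild(mosc, mosb)
	}
	// Non-layered pool: the node must target the pool's rendered MachineConfig
	// and must not still carry image annotations from a previous layering opt-in.
	return lns.IsDesiredMachineConfigEqualToPool(pool) && !lns.AreImageAnnotationsPresentOnNode()
}

func main() {}
```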
@@ -1200,7 +1204,7 @@ func (ctrl *Controller) setClusterConfigAnnotation(nodes []*corev1.Node) error {
 
 // updateCandidateNode needs to understand MOSB
 // specifically, the LayeredNodeState probably needs to understand mosb
-func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild, nodeName string, pool *mcfgv1.MachineConfigPool) error {
+func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild, nodeName string, pool *mcfgv1.MachineConfigPool, layered bool) error {
     return clientretry.RetryOnConflict(constants.NodeUpdateBackoff, func() error {
         oldNode, err := ctrl.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
         if err != nil {
@@ -1212,30 +1216,9 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1.MachineOSConfig, mosb *
         }
 
         lns := ctrlcommon.NewLayeredNodeState(oldNode)
-        layered, err := ctrl.isLayeredPool(mosc, mosb)
-        if err != nil {
-            return fmt.Errorf("Failed to determine whether pool %s opts in to OCL due to an error: %s", pool.Name, err)
-        }
-        if mosb == nil {
-            if lns.IsDesiredEqualToPool(pool, layered) {
-                // If the node's desired annotations match the pool, return directly without updating the node.
-                klog.V(4).Infof("Pool %s: node %s: no update is needed", pool.Name, nodeName)
-                return nil
-
-            }
-            lns.SetDesiredStateFromPool(layered, pool)
-
+        if !layered {
+            lns.SetDesiredStateFromPool(pool)
         } else {
-            if lns.IsDesiredEqualToBuild(mosc, mosb) {
-                // If the node's desired annotations match the pool, return directly without updating the node.
-                klog.V(4).Infof("Pool %s: node %s: no update is needed", pool.Name, nodeName)
-                return nil
-            }
-            // ensure this is happening. it might not be.
-            // we need to ensure the node controller is triggered at all the same times
-            // when using this new system
-            // we know the mosc+mosb can trigger one another and cause a build, but if the node controller
-            // can't set this anno, and subsequently cannot trigger the daemon to update, we need to rework.
             lns.SetDesiredStateFromMachineOSConfig(mosc, mosb)
         }
 
@@ -1246,6 +1229,12 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1.MachineOSConfig, mosb *
             return err
         }
 
+        // Don't make a patch call if no update is needed.
+        if reflect.DeepEqual(newData, oldData) {
+            return nil
+        }
+
+        klog.V(4).Infof("Pool %s: layered=%v node %s update is needed", pool.Name, layered, nodeName)
         patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, corev1.Node{})
         if err != nil {
             return fmt.Errorf("failed to create patch for node %q: %w", nodeName, err)
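
The new guard above avoids issuing a no-op PATCH against the API server: the node is only patched when the serialized old and new objects actually differ. The same pattern in isolation, with plain byte slices standing in for the marshaled Node objects:

```go
package main

import (
	"fmt"
	"reflect"
)

// needsPatch mirrors the reflect.DeepEqual guard: skip the patch call when the
// marshaled old and new objects are identical.
func needsPatch(oldData, newData []byte) bool {
	return !reflect.DeepEqual(oldData, newData)
}

func main() {
	oldData := []byte(`{"desiredConfig":"rendered-worker-1"}`)
	newData := []byte(`{"desiredConfig":"rendered-worker-1"}`)
	fmt.Println(needsPatch(oldData, newData)) // false: no API call made
}
```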
@@ -1258,7 +1247,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1.MachineOSConfig, mosb *
 // getAllCandidateMachines returns all possible nodes which can be updated to the target config, along with a maximum
 // capacity. It is the reponsibility of the caller to choose a subset of the nodes given the capacity.
 func getAllCandidateMachines(layered bool, config *mcfgv1.MachineOSConfig, build *mcfgv1.MachineOSBuild, pool *mcfgv1.MachineConfigPool, nodesInPool []*corev1.Node, maxUnavailable int) ([]*corev1.Node, uint) {
-    unavail := getUnavailableMachines(nodesInPool, pool, layered, build)
+    unavail := getUnavailableMachines(nodesInPool, pool)
     if len(unavail) >= maxUnavailable {
         klog.V(4).Infof("getAllCandidateMachines: No capacity left for pool %s (unavail=%d >= maxUnavailable=%d)",
             pool.Name, len(unavail), maxUnavailable)
@@ -1272,26 +1261,18 @@ func getAllCandidateMachines(layered bool, config *mcfgv1.MachineOSConfig, build
     var nodes []*corev1.Node
     for _, node := range nodesInPool {
         lns := ctrlcommon.NewLayeredNodeState(node)
-        if !layered {
-            if lns.IsDesiredEqualToPool(pool, layered) {
-                if isNodeMCDFailing(node) {
-                    failingThisConfig++
-                }
-                continue
-            }
-        } else {
-            if lns.IsDesiredEqualToBuild(config, build) {
-                // If the node's desired annotations match the pool, return directly without updating the node.
-                klog.V(4).Infof("Pool %s: layered node %s: no update is needed", pool.Name, node.Name)
-                continue
+        if !lns.CheckNodeCandidacyForUpdate(layered, pool, config, build) {
+            if lns.IsNodeMCDFailing() {
+                failingThisConfig++
             }
+            continue
         }
         // Ignore nodes that are currently mid-update or unscheduled
-        if !isNodeReady(node) {
+        if !lns.IsNodeReady() {
             klog.V(4).Infof("node %s skipped during candidate selection as it is currently unscheduled", node.Name)
             continue
         }
-        klog.V(4).Infof("Pool %s: selected candidate node %s", pool.Name, node.Name)
+        klog.Infof("Pool %s: selected candidate node %s", pool.Name, node.Name)
         nodes = append(nodes, node)
     }
     // Nodes which are failing to target this config also count against
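
The old per-branch checks (IsDesiredEqualToPool for non-layered pools, IsDesiredEqualToBuild for layered ones) are collapsed into a single CheckNodeCandidacyForUpdate call. The real implementation lives in layered_node_state.go and is not shown in this hunk; a hedged sketch of what that predicate presumably combines, based only on the removed branches above and with stand-in types:

```go
package main

type machineConfigPool struct{}
type machineOSConfig struct{}
type machineOSBuild struct{}

// layeredNodeState is a stand-in exposing just the checks the removed code used.
type layeredNodeState struct {
	desiredEqualToPool  bool
	desiredEqualToBuild bool
}

func (l *layeredNodeState) isDesiredEqualToPool(*machineConfigPool) bool { return l.desiredEqualToPool }
func (l *layeredNodeState) isDesiredEqualToBuild(*machineOSConfig, *machineOSBuild) bool {
	return l.desiredEqualToBuild
}

// checkNodeCandidacyForUpdate sketches the combined predicate: a node is a
// candidate only if it does not already target the pool (non-layered case)
// or the build (layered case).
func (l *layeredNodeState) checkNodeCandidacyForUpdate(layered bool, pool *machineConfigPool, mosc *machineOSConfig, mosb *machineOSBuild) bool {
	if layered {
		return !l.isDesiredEqualToBuild(mosc, mosb)
	}
	return !l.isDesiredEqualToPool(pool)
}

func main() {}
```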
@@ -1308,15 +1289,6 @@ func getAllCandidateMachines(layered bool, config *mcfgv1.MachineOSConfig, build
     return nodes, uint(capacity)
 }
 
-// getCandidateMachines returns the maximum subset of nodes which can be updated to the target config given availability constraints.
-func getCandidateMachines(pool *mcfgv1.MachineConfigPool, config *mcfgv1.MachineOSConfig, build *mcfgv1.MachineOSBuild, nodesInPool []*corev1.Node, maxUnavailable int, layered bool) []*corev1.Node {
-    nodes, capacity := getAllCandidateMachines(layered, config, build, pool, nodesInPool, maxUnavailable)
-    if uint(len(nodes)) < capacity {
-        return nodes
-    }
-    return nodes[:capacity]
-}
-
 // getOperatorPodNodeName fetches the name of the current node running the machine-config-operator pod
 func (ctrl *Controller) getOperatorNodeName() (string, error) {
     // Create a selector object with a filter on the machine-config-operator pod
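
With getCandidateMachines deleted, capping the candidate list to the available capacity is now the caller's job, as the getAllCandidateMachines doc comment states. The deleted capping logic, kept here as a stand-alone sketch of what a caller does (node names used in place of *corev1.Node for brevity):

```go
package main

import "fmt"

// capToCapacity reproduces the logic of the deleted getCandidateMachines:
// return at most `capacity` candidates.
func capToCapacity(nodes []string, capacity uint) []string {
	if uint(len(nodes)) < capacity {
		return nodes
	}
	return nodes[:capacity]
}

func main() {
	candidates := []string{"worker-0", "worker-1", "worker-2"}
	fmt.Println(capToCapacity(candidates, 2)) // [worker-0 worker-1]
}
```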
@@ -1392,7 +1364,7 @@ func (ctrl *Controller) setDesiredAnnotations(layered bool, mosc *mcfgv1.Machine
         klog.Infof("Continuing to sync layered MachineConfigPool %s", pool.Name)
     }
     for _, node := range candidates {
-        if err := ctrl.updateCandidateNode(mosc, mosb, node.Name, pool); err != nil {
+        if err := ctrl.updateCandidateNode(mosc, mosb, node.Name, pool, layered); err != nil {
             return fmt.Errorf("setting desired %s for node %s: %w", pool.Spec.Configuration.Name, node.Name, err)
         }
     }
