Skip to content

Commit 3347699

Browse files
cheesesashimidkhater-redhat
authored andcommitted
unify update paths for non-OCL and OCL and avoid reboots by recreating and reusing MOSB's with updated rendered spec
1 parent cb7cd68 commit 3347699

File tree

10 files changed

+613
-285
lines changed

10 files changed

+613
-285
lines changed

pkg/controller/build/clients.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
type informers struct {
2626
controllerConfigInformer mcfginformersv1.ControllerConfigInformer
2727
machineConfigPoolInformer mcfginformersv1.MachineConfigPoolInformer
28+
machineConfigInformer mcfginformersv1.MachineConfigInformer
2829
jobInformer batchinformersv1.JobInformer
2930
machineOSBuildInformer mcfginformersv1.MachineOSBuildInformer
3031
machineOSConfigInformer mcfginformersv1.MachineOSConfigInformer
@@ -46,6 +47,7 @@ func (i *informers) listers() *listers {
4647
machineOSBuildLister: i.machineOSBuildInformer.Lister(),
4748
machineOSConfigLister: i.machineOSConfigInformer.Lister(),
4849
machineConfigPoolLister: i.machineConfigPoolInformer.Lister(),
50+
machineConfigLister: i.machineConfigInformer.Lister(),
4951
jobLister: i.jobInformer.Lister(),
5052
controllerConfigLister: i.controllerConfigInformer.Lister(),
5153
nodeLister: i.nodeInformer.Lister(),
@@ -57,6 +59,7 @@ type listers struct {
5759
machineOSBuildLister mcfglistersv1.MachineOSBuildLister
5860
machineOSConfigLister mcfglistersv1.MachineOSConfigLister
5961
machineConfigPoolLister mcfglistersv1.MachineConfigPoolLister
62+
machineConfigLister mcfglistersv1.MachineConfigLister
6063
jobLister batchlisterv1.JobLister
6164
controllerConfigLister mcfglistersv1.ControllerConfigLister
6265
nodeLister corelistersv1.NodeLister
@@ -91,6 +94,7 @@ func newInformers(mcfgclient mcfgclientset.Interface, kubeclient clientset.Inter
9194
controllerConfigInformer := mcoInformerFactory.Machineconfiguration().V1().ControllerConfigs()
9295
machineConfigPoolInformer := mcoInformerFactory.Machineconfiguration().V1().MachineConfigPools()
9396
machineOSBuildInformer := mcoInformerFactory.Machineconfiguration().V1().MachineOSBuilds()
97+
machineConfigInformer := mcoInformerFactory.Machineconfiguration().V1().MachineConfigs()
9498
machineOSConfigInformer := mcoInformerFactory.Machineconfiguration().V1().MachineOSConfigs()
9599
jobInformer := coreInformerFactory.Batch().V1().Jobs()
96100
nodeInformer := coreInformerFactoryNodes.Core().V1().Nodes()
@@ -100,6 +104,7 @@ func newInformers(mcfgclient mcfgclientset.Interface, kubeclient clientset.Inter
100104
machineConfigPoolInformer: machineConfigPoolInformer,
101105
machineOSBuildInformer: machineOSBuildInformer,
102106
machineOSConfigInformer: machineOSConfigInformer,
107+
machineConfigInformer: machineConfigInformer,
103108
jobInformer: jobInformer,
104109
nodeInformer: nodeInformer,
105110
toStart: []interface{ Start(<-chan struct{}) }{

pkg/controller/build/osbuildcontroller_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,3 +756,36 @@ func assertMachineOSConfigGetsCurrentBuildAnnotation(ctx context.Context, t *tes
756756

757757
require.NoError(t, err)
758758
}
759+
760+
// Test that when the MCP’s rendered-MC name changes but the two MCs only differ
761+
// by on-cluster layering, no Build Job is created.
762+
func TestOSBuildControllerSkipsBuildForLayerOnlyChanges(t *testing.T) {
763+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
764+
t.Cleanup(cancel)
765+
poolName := "worker"
766+
767+
_, mcfgclient, _, _, mosc, firstMosb, mcp, kubeassert := setupOSBuildControllerForTestWithSuccessfulBuild(ctx, t, poolName)
768+
isMachineOSBuildReachedExpectedCount(ctx, t, mcfgclient, mosc, 1)
769+
770+
assertMachineOSConfigGetsCurrentBuildAnnotation(ctx, t, mcfgclient, mosc, firstMosb)
771+
772+
isMachineOSBuildReachedExpectedCount(ctx, t, mcfgclient, mosc, 1)
773+
774+
insertNewRenderedMachineConfigAndUpdatePool(ctx, t, mcfgclient, mcp.Name, "rendered-worker-layer-only")
775+
776+
// Give the controller a moment to (not) kick off a build
777+
time.Sleep(200 * time.Millisecond)
778+
779+
mosbList, err := mcfgclient.MachineconfigurationV1().
780+
MachineOSBuilds().
781+
List(ctx, metav1.ListOptions{LabelSelector: utils.MachineOSBuildForPoolSelector(mosc).String()})
782+
require.NoError(t, err)
783+
require.Len(t, mosbList.Items, 2, "expected a new MOSB to be created for layering-only change")
784+
assert.Equal(t, firstMosb.Name, mosbList.Items[0].Name, "first MOSB should remain unchanged")
785+
786+
layerOnlyMosb := mosbList.Items[1]
787+
788+
jobName := utils.GetBuildJobName(&layerOnlyMosb)
789+
790+
kubeassert.JobDoesNotExist(jobName, "layering-only MOSB should not spawn a build Job")
791+
}

pkg/controller/build/reconciler.go

Lines changed: 150 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
imagev1clientset "github.com/openshift/client-go/image/clientset/versioned"
1414
mcfgclientset "github.com/openshift/client-go/machineconfiguration/clientset/versioned"
1515
routeclientset "github.com/openshift/client-go/route/clientset/versioned"
16+
"github.com/openshift/machine-config-operator/pkg/apihelpers"
1617
"github.com/openshift/machine-config-operator/pkg/controller/build/buildrequest"
1718
"github.com/openshift/machine-config-operator/pkg/controller/build/constants"
1819
"github.com/openshift/machine-config-operator/pkg/controller/build/imagebuilder"
@@ -373,13 +374,11 @@ func (b *buildReconciler) UpdateMachineConfigPool(ctx context.Context, oldMCP, c
373374
func (b *buildReconciler) updateMachineConfigPool(ctx context.Context, oldMCP, curMCP *mcfgv1.MachineConfigPool) error {
374375
if oldMCP.Spec.Configuration.Name != curMCP.Spec.Configuration.Name {
375376
klog.Infof("Rendered config for pool %s changed from %s to %s", curMCP.Name, oldMCP.Spec.Configuration.Name, curMCP.Spec.Configuration.Name)
376-
if err := b.createNewMachineOSBuildOrReuseExistingForPoolChange(ctx, curMCP); err != nil {
377+
if err := b.reconcilePoolChange(ctx, curMCP); err != nil {
377378
return fmt.Errorf("could not create or reuse existing MachineOSBuild for MachineConfigPool %q change: %w", curMCP.Name, err)
378379
}
379380
}
380381

381-
// Not sure if we need to do this here yet or not.
382-
// TODO: Determine if we should call b.syncMachineConfigPools() here or not.
383382
return b.syncAll(ctx)
384383
}
385384

@@ -556,7 +555,7 @@ func (b *buildReconciler) createNewMachineOSBuildOrReuseExisting(ctx context.Con
556555
// In this situation, we've determined that the MachineOSBuild does not
557556
// exist, so we need to create it.
558557
if k8serrors.IsNotFound(err) {
559-
_, err := b.mcfgclient.MachineconfigurationV1().MachineOSBuilds().Create(ctx, mosb, metav1.CreateOptions{})
558+
mosb, err := b.mcfgclient.MachineconfigurationV1().MachineOSBuilds().Create(ctx, mosb, metav1.CreateOptions{})
560559
if err != nil && !k8serrors.IsAlreadyExists(err) {
561560
return fmt.Errorf("could not create new MachineOSBuild %q: %w", mosb.Name, err)
562561
}
@@ -1188,12 +1187,35 @@ func (b *buildReconciler) syncMachineOSBuild(ctx context.Context, mosb *mcfgv1.M
11881187
return fmt.Errorf("could not get MachineConfigPool from MachineOSBuild %q: %w", mosb.Name, err)
11891188
}
11901189

1191-
// An mosb which had previously been forgotten by the queue and is no longer desired by the mcp should not build
11921190
if mosb.ObjectMeta.Labels[constants.RenderedMachineConfigLabelKey] != mcp.Spec.Configuration.Name {
11931191
klog.Infof("The MachineOSBuild %q which builds the rendered Machine Config %q is no longer desired by the MCP %q", mosb.Name, mosb.ObjectMeta.Labels[constants.RenderedMachineConfigLabelKey], mosb.ObjectMeta.Labels[constants.TargetMachineConfigPoolLabelKey])
11941192
return nil
11951193
}
11961194

1195+
oldMCName := mcp.Status.Configuration.Name
1196+
newMCName := mcp.Spec.Configuration.Name
1197+
1198+
oldRendered, err := b.machineConfigLister.Get(oldMCName)
1199+
if err != nil {
1200+
return err
1201+
}
1202+
newRendered, err := b.machineConfigLister.Get(newMCName)
1203+
if err != nil {
1204+
return err
1205+
}
1206+
1207+
old := mcp.DeepCopy()
1208+
old.Spec.Configuration.Name = mcp.Status.Configuration.Name
1209+
1210+
needsImageRebuild, err := b.reconcileImageRebuild(old, mcp)
1211+
if err != nil {
1212+
return err
1213+
}
1214+
if oldRendered != newRendered && !needsImageRebuild {
1215+
klog.Infof("MachineOSBuild %q: only layering changes → skipping image build", mosb.Name)
1216+
return nil
1217+
}
1218+
11971219
mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
11981220

11991221
if mosbState.IsInTerminalState() {
@@ -1328,6 +1350,128 @@ func (b *buildReconciler) syncMachineConfigPools(ctx context.Context) error {
13281350
// if needed.
13291351
func (b *buildReconciler) syncMachineConfigPool(ctx context.Context, mcp *mcfgv1.MachineConfigPool) error {
13301352
return b.timeObjectOperation(mcp, syncingVerb, func() error {
1331-
return b.createNewMachineOSBuildOrReuseExistingForPoolChange(ctx, mcp)
1353+
return b.reconcilePoolChange(ctx, mcp)
1354+
})
1355+
}
1356+
1357+
func (b *buildReconciler) reconcilePoolChange(ctx context.Context, mcp *mcfgv1.MachineConfigPool) error {
1358+
mosc, err := utils.GetMachineOSConfigForMachineConfigPool(mcp, b.utilListers())
1359+
if err != nil {
1360+
// Not opted in yet, so nothing for us to do.
1361+
klog.Infof("buildReconciler: no MachineOSConfig for pool %q, skipping", mcp.Name)
1362+
return nil
1363+
}
1364+
1365+
oldRendered := mcp.Status.Configuration.Name
1366+
newRendered := mcp.Spec.Configuration.Name
1367+
1368+
// old pool
1369+
old := mcp.DeepCopy()
1370+
old.Spec.Configuration.Name = mcp.Status.Configuration.Name
1371+
firstOptIn := mosc.Annotations[constants.CurrentMachineOSBuildAnnotationKey]
1372+
if firstOptIn == "" {
1373+
return fmt.Errorf("no current build annotation on MachineOSConfig %q", mosc.Name)
1374+
}
1375+
1376+
needsImageRebuild, err := b.reconcileImageRebuild(old, mcp)
1377+
if err != nil {
1378+
return err
1379+
}
1380+
1381+
if (oldRendered != newRendered && needsImageRebuild) || firstOptIn == "" {
1382+
klog.Infof("pool %q: detected extension/kernel/OSImageURL change → will rebuild image", mcp.Name)
1383+
} else if oldRendered != newRendered && !needsImageRebuild {
1384+
klog.Infof("pool %q: only layering changes → reusing existing image", mcp.Name)
1385+
prevPullSpec := mosc.Status.CurrentImagePullSpec
1386+
oldMOSB, err := utils.GetMachineOSBuildForImagePullspec(string(prevPullSpec), b.utilListers())
1387+
if err != nil {
1388+
return fmt.Errorf("failed to look up MachineOSBuild %q: %w", oldMOSB.Name, err)
1389+
}
1390+
return b.reuseImageForNewMOSB(ctx, mosc, oldMOSB)
1391+
}
1392+
1393+
return b.createNewMachineOSBuildOrReuseExisting(ctx, mosc, needsImageRebuild)
1394+
1395+
}
1396+
1397+
// reuseImageForNewMOSB creates a new MOSB (for the new rendered-MC name)
1398+
// but populates its status from oldMosb so that no build actually runs.
1399+
func (b *buildReconciler) reuseImageForNewMOSB(ctx context.Context, mosc *mcfgv1.MachineOSConfig, oldMosb *mcfgv1.MachineOSBuild,
1400+
) error {
1401+
// Look up the MCP
1402+
mcp, err := b.machineConfigPoolLister.Get(mosc.Spec.MachineConfigPool.Name)
1403+
if err != nil {
1404+
return err
1405+
}
1406+
1407+
// Build the new MOSB object
1408+
osImageURLs, err := ctrlcommon.GetOSImageURLConfig(ctx, b.kubeclient)
1409+
if err != nil {
1410+
return err
1411+
}
1412+
newMosb, err := buildrequest.NewMachineOSBuild(
1413+
buildrequest.MachineOSBuildOpts{
1414+
MachineOSConfig: mosc,
1415+
MachineConfigPool: mcp,
1416+
OSImageURLConfig: osImageURLs,
1417+
})
1418+
if err != nil {
1419+
return err
1420+
}
1421+
newMosb.SetOwnerReferences([]metav1.OwnerReference{
1422+
*metav1.NewControllerRef(mosc, mcfgv1.SchemeGroupVersion.WithKind("MachineOSConfig")),
13321423
})
1424+
1425+
// Create it if not already there
1426+
_, err = b.machineOSBuildLister.Get(newMosb.Name)
1427+
if k8serrors.IsNotFound(err) {
1428+
if newMosb, err = b.mcfgclient.
1429+
MachineconfigurationV1().
1430+
MachineOSBuilds().
1431+
Create(ctx, newMosb, metav1.CreateOptions{}); err != nil {
1432+
return err
1433+
}
1434+
} else if err != nil {
1435+
return err
1436+
}
1437+
1438+
// Change its status so it’s “Succeeded” with the old image
1439+
toUpdate, err := b.getMachineOSBuildForUpdate(newMosb)
1440+
if err != nil {
1441+
return err
1442+
}
1443+
oldStatus := toUpdate.Status
1444+
1445+
toUpdate.Status.DigestedImagePushSpec = oldMosb.Status.DigestedImagePushSpec
1446+
1447+
for _, c := range apihelpers.MachineOSBuildSucceededConditions() {
1448+
apihelpers.SetMachineOSBuildCondition(&toUpdate.Status, c)
1449+
}
1450+
1451+
if err := b.setStatusOnMachineOSBuildIfNeeded(ctx, toUpdate, oldStatus, toUpdate.Status); err != nil {
1452+
return err
1453+
}
1454+
1455+
// re-point the MC to the NEW build
1456+
return b.updateMachineOSConfigStatus(ctx, mosc, toUpdate)
1457+
}
1458+
1459+
// reconcilePoolChange returns (annotateRebuild bool, reuseOnly bool, err error)
1460+
func (b *buildReconciler) reconcileImageRebuild(oldMCP, curMCP *mcfgv1.MachineConfigPool) (bool, error) {
1461+
1462+
curr, err := b.machineConfigLister.Get(oldMCP.Spec.Configuration.Name)
1463+
if err != nil {
1464+
return false, err
1465+
}
1466+
des, err := b.machineConfigLister.Get(curMCP.Spec.Configuration.Name)
1467+
if err != nil {
1468+
return false, err
1469+
}
1470+
1471+
needsImageRebuild := ctrlcommon.RequiresRebuild(curr, des)
1472+
if needsImageRebuild {
1473+
return true, nil
1474+
}
1475+
1476+
return false, nil
13331477
}

pkg/controller/common/helpers.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1462,3 +1462,11 @@ func GetCAsFromConfigMap(cm *corev1.ConfigMap, key string) ([]byte, error) {
14621462
}
14631463
return nil, fmt.Errorf("%s not found in %s/%s", key, cm.Namespace, cm.Name)
14641464
}
1465+
1466+
// Determines if an on-cluster layering image rollout and rebuild is required for the changes applied on the new MC
1467+
func RequiresRebuild(oldMC, newMC *mcfgv1.MachineConfig) bool {
1468+
return oldMC.Spec.OSImageURL != newMC.Spec.OSImageURL ||
1469+
oldMC.Spec.KernelType != newMC.Spec.KernelType ||
1470+
!reflect.DeepEqual(oldMC.Spec.Extensions, newMC.Spec.Extensions) ||
1471+
!reflect.DeepEqual(oldMC.Spec.KernelArguments, newMC.Spec.KernelArguments)
1472+
}

pkg/controller/node/node_controller.go

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,7 @@ func (ctrl *Controller) getConfigAndBuildAndLayeredStatus(pool *mcfgv1.MachineCo
901901

902902
isLayered, err := ctrl.isLayeredPool(mosc, mosb)
903903
if err != nil {
904-
return nil, nil, false, fmt.Errorf("Failed to determine whether pool %s opts in to OCL due to an error: %s", pool.Name, err)
904+
return nil, nil, false, fmt.Errorf("failed to check layered status: %w", err)
905905
}
906906

907907
return mosc, mosb, isLayered, nil
@@ -948,9 +948,8 @@ func (ctrl *Controller) getConfigAndBuild(pool *mcfgv1.MachineConfigPool) (*mcfg
948948
}
949949

950950
func (ctrl *Controller) canLayeredContinue(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) (string, bool, error) {
951-
// This is an edgecase which we should ideally never hit. However, it is
952-
// better to anticipate it and have an error message ready vs. the
953-
// alternative.
951+
952+
klog.Infof("Can Layered continue?")
954953
if mosc == nil && mosb != nil {
955954
msg := fmt.Sprintf("orphaned MachineOSBuild %q found, but MachineOSConfig %q not found", mosb.Name, mosb.Labels[buildconstants.MachineOSConfigNameLabelKey])
956955
return msg, false, fmt.Errorf("%s", msg)
@@ -968,6 +967,8 @@ func (ctrl *Controller) canLayeredContinue(mosc *mcfgv1.MachineOSConfig, mosb *m
968967

969968
if !hasImage {
970969
return "Desired image not set in MachineOSConfig", false, nil
970+
} else if hasImage {
971+
klog.Infof("pullspec is %s", pullspec)
971972
}
972973

973974
switch {
@@ -1050,6 +1051,7 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
10501051
}
10511052

10521053
if layered {
1054+
klog.Infof("Continuing updates for layered pool %s", pool.Name)
10531055
reason, canApplyUpdates, err := ctrl.canLayeredContinue(mosc, mosb)
10541056
if err != nil {
10551057
klog.Infof("Layered pool %s encountered an error: %s", pool.Name, err)
@@ -1062,11 +1064,10 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
10621064
return ctrl.syncStatusOnly(pool)
10631065
}
10641066

1065-
klog.V(4).Infof("Continuing updates for layered pool %s", pool.Name)
1066-
} else {
1067-
klog.V(4).Infof("Pool %s is not layered", pool.Name)
1067+
klog.Infof("Continuing updates for layered pool %s", pool.Name)
10681068
}
10691069

1070+
klog.Info("getting nodes for pool")
10701071
nodes, err := ctrl.getNodesForPool(pool)
10711072
if err != nil {
10721073
if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil {
@@ -1076,6 +1077,7 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
10761077
return err
10771078
}
10781079

1080+
klog.Info("maxunavailable check")
10791081
maxunavail, err := maxUnavailable(pool, nodes)
10801082
if err != nil {
10811083
if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil {
@@ -1113,6 +1115,7 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
11131115
}
11141116
}
11151117
}
1118+
klog.Info("get all candidate machines")
11161119
candidates, capacity := getAllCandidateMachines(layered, mosc, mosb, pool, nodes, maxunavail)
11171120
if len(candidates) > 0 {
11181121
zones := make(map[string]bool)
@@ -1130,6 +1133,7 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
11301133
}
11311134
return err
11321135
}
1136+
klog.Info("updating state metric")
11331137
ctrlcommon.UpdateStateMetric(ctrlcommon.MCCSubControllerState, "machine-config-controller-node", "Sync Machine Config Pool", pool.Name)
11341138
}
11351139
return ctrl.syncStatusOnly(pool)
@@ -1282,16 +1286,16 @@ func getAllCandidateMachines(layered bool, config *mcfgv1.MachineOSConfig, build
12821286
} else {
12831287
if lns.IsDesiredEqualToBuild(config, build) {
12841288
// If the node's desired annotations match the pool, return directly without updating the node.
1285-
klog.V(4).Infof("Pool %s: layered node %s: no update is needed", pool.Name, node.Name)
1289+
klog.Infof("Pool %s: layered node %s: no update is needed", pool.Name, node.Name)
12861290
continue
12871291
}
12881292
}
12891293
// Ignore nodes that are currently mid-update or unscheduled
12901294
if !isNodeReady(node) {
1291-
klog.V(4).Infof("node %s skipped during candidate selection as it is currently unscheduled", node.Name)
1295+
klog.Infof("node %s skipped during candidate selection as it is currently unscheduled", node.Name)
12921296
continue
12931297
}
1294-
klog.V(4).Infof("Pool %s: selected candidate node %s", pool.Name, node.Name)
1298+
klog.Infof("Pool %s: selected candidate node %s", pool.Name, node.Name)
12951299
nodes = append(nodes, node)
12961300
}
12971301
// Nodes which are failing to target this config also count against
@@ -1388,9 +1392,11 @@ func (ctrl *Controller) setDesiredAnnotations(layered bool, mosc *mcfgv1.Machine
13881392

13891393
if layered {
13901394
eventName = "SetDesiredConfigAndOSImage"
1391-
updateName = fmt.Sprintf("%s / Image: %s", updateName, ctrlcommon.NewMachineOSConfigState(mosc).GetOSImage())
1392-
klog.Infof("Continuing to sync layered MachineConfigPool %s", pool.Name)
1395+
moscImage := ctrlcommon.NewMachineOSConfigState(mosc).GetOSImage()
1396+
updateName = fmt.Sprintf("%s / Image: %s", updateName, moscImage)
1397+
klog.Infof("Continuing to sync layered MachineConfigPool %s with %s", pool.Name, moscImage)
13931398
}
1399+
13941400
for _, node := range candidates {
13951401
if err := ctrl.updateCandidateNode(mosc, mosb, node.Name, pool); err != nil {
13961402
return fmt.Errorf("setting desired %s for node %s: %w", pool.Spec.Configuration.Name, node.Name, err)

0 commit comments

Comments
 (0)