From 6afdfa18c8b56aa1728d5520384c0d005676a884 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 8 May 2025 16:05:08 -0400 Subject: [PATCH] msbic: add hot loop detection --- .../machine_set_boot_image_controller.go | 54 +++++++++++ .../machine_set_boot_image_controller_test.go | 91 +++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go diff --git a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go b/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go index 29f444ada3..4d9a1843e6 100644 --- a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go +++ b/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go @@ -1,6 +1,7 @@ package machineset import ( + "bytes" "context" "encoding/json" "fmt" @@ -61,6 +62,7 @@ type Controller struct { mapiStats MachineResourceStats capiMachineSetStats MachineResourceStats capiMachineDeploymentStats MachineResourceStats + mapiBootImageState map[string]BootImageState conditionMutex sync.Mutex mapiSyncMutex sync.Mutex @@ -74,6 +76,13 @@ type MachineResourceStats struct { totalCount int } +// State structure uses for detecting hot loops. Reset when cluster is opted +// out of boot image updates. +type BootImageState struct { + value []byte + hotLoopCount int +} + // Helper function that checks if all resources have been evaluated func (mrs MachineResourceStats) isFinished() bool { return mrs.totalCount == (mrs.inProgress + mrs.erroredCount) @@ -91,6 +100,9 @@ const ( ArchLabelKey = "kubernetes.io/arch=" OSLabelKey = "machine.openshift.io/os-id" + + // Threshold for hot loop detection + HotLoopLimit = 3 ) // New returns a new machine-set-boot-image controller. @@ -145,6 +157,8 @@ func New( ctrl.featureGateAccess = featureGateAccess + ctrl.mapiBootImageState = map[string]BootImageState{} + return ctrl } @@ -355,6 +369,11 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { } if !machineManagerFound { klog.V(4).Infof("No MAPI machineset manager was found, so no MAPI machinesets will be enrolled.") + // clear out MAPI boot image history + for k := range ctrl.mapiBootImageState { + delete(ctrl.mapiBootImageState, k) + } + } mapiMachineSets, err := ctrl.mapiMachineSetLister.List(machineResourceSelector) @@ -367,6 +386,10 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { // If no machine resources were enrolled; exit the enqueue process without errors. if len(mapiMachineSets) == 0 { klog.Infof("No MAPI machinesets were enrolled, so no MAPI machinesets will be enqueued.") + // clear out MAPI boot image history + for k := range ctrl.mapiBootImageState { + delete(ctrl.mapiBootImageState, k) + } } // Reset stats before initiating reconciliation loop @@ -477,6 +500,10 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet // Patch the machineset if required if patchRequired { + // First, check if we're hot looping + if ctrl.checkMAPIMachineSetHotLoop(newMachineSet) { + return fmt.Errorf("refusing to reconcile machineset %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", machineSet.Name) + } klog.Infof("Patching MAPI machineset %s", machineSet.Name) return ctrl.patchMachineSet(machineSet, newMachineSet) } @@ -484,6 +511,33 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet return nil } +// Checks against a local store of boot image updates to detect hot looping +func (ctrl *Controller) checkMAPIMachineSetHotLoop(machineSet *machinev1beta1.MachineSet) bool { + bis, ok := ctrl.mapiBootImageState[machineSet.Name] + if !ok { + // If the machineset doesn't currently have a record, create a new one. + ctrl.mapiBootImageState[machineSet.Name] = BootImageState{ + value: machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw, + hotLoopCount: 1, + } + } else { + hotLoopCount := 1 + // If the controller is updating to a value that was previously updated to, increase the hot loop counter + if bytes.Equal(bis.value, machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw) { + hotLoopCount = (bis.hotLoopCount) + 1 + } + // Return an error and degrade if the hot loop counter is above threshold + if hotLoopCount > HotLoopLimit { + return true + } + ctrl.mapiBootImageState[machineSet.Name] = BootImageState{ + value: machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw, + hotLoopCount: hotLoopCount, + } + } + return false +} + // This function patches the machineset object using the machineClient // Returns an error if marshsalling or patching fails. func (ctrl *Controller) patchMachineSet(oldMachineSet, newMachineSet *machinev1beta1.MachineSet) error { diff --git a/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go b/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go new file mode 100644 index 0000000000..80f006dd11 --- /dev/null +++ b/pkg/controller/machine-set-boot-image/machine_set_boot_image_controller_test.go @@ -0,0 +1,91 @@ +package machineset + +import ( + "testing" + + machinev1beta1 "github.com/openshift/api/machine/v1beta1" + "github.com/stretchr/testify/assert" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +func TestHotLoop(t *testing.T) { + cases := []struct { + name string + machineset *machinev1beta1.MachineSet + generateBootImageFunc func(string) string + updateCount int + expectHotLoop bool + }{ + { + name: "Hot loop detected due to multiple updates to same value", + machineset: getMachineSet("machine-set-1", "boot-image-1"), + updateCount: HotLoopLimit + 1, + generateBootImageFunc: func(s string) string { return s }, + expectHotLoop: true, + }, + { + name: "Hot loop not detected due to update count being lower than threshold", + machineset: getMachineSet("machine-set-1", "boot-image-1"), + generateBootImageFunc: func(s string) string { return s }, + updateCount: HotLoopLimit - 1, + expectHotLoop: false, + }, + { + name: "Hot loop not detected due to target boot image changing", + machineset: getMachineSet("machine-set-1", "boot-image-1"), + generateBootImageFunc: func(s string) string { return s + "-1" }, + updateCount: HotLoopLimit + 1, + expectHotLoop: false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + ctrl := &Controller{ + mapiBootImageState: map[string]BootImageState{}, + } + // checkMAPIMachineSetHotLoop is only called in the controller when a patch is required, + // so every call to it is assuming a boot image update is going to take place. + + // No hot loops should be detected in the first (updateCount - 1) calls + var hotLoopDetected bool + for range tc.updateCount - 1 { + hotLoopDetected = ctrl.checkMAPIMachineSetHotLoop(tc.machineset) + assert.Equal(t, false, hotLoopDetected) + // Change target boot image for next iteration + setMachineSetBootImage(tc.machineset, tc.generateBootImageFunc) + } + // Check for hot loop on the last iteration + hotLoopDetected = ctrl.checkMAPIMachineSetHotLoop(tc.machineset) + assert.Equal(t, tc.expectHotLoop, hotLoopDetected) + }) + } +} + +// Returns a machineset with a given boot image +func getMachineSet(name, bootImage string) *machinev1beta1.MachineSet { + return &machinev1beta1.MachineSet{ + ObjectMeta: v1.ObjectMeta{ + Name: name, + }, + Spec: machinev1beta1.MachineSetSpec{ + Template: machinev1beta1.MachineTemplateSpec{ + Spec: machinev1beta1.MachineSpec{ + ProviderSpec: machinev1beta1.ProviderSpec{ + Value: &runtime.RawExtension{ + Raw: []byte(bootImage), + }, + }, + }, + }, + }, + } +} + +// Sets the boot image in a machineset based on the generate function passed in +func setMachineSetBootImage(machineset *machinev1beta1.MachineSet, generateBootImageFunc func(string) string) { + currentBootImage := string(machineset.Spec.Template.Spec.ProviderSpec.Value.Raw) + machineset.Spec.Template.Spec.ProviderSpec.Value.Raw = []byte(generateBootImageFunc(currentBootImage)) +}