Skip to content

[release-4.19] OCPBUGS-56180: Add hot loop detection in the boot image controller #5050

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package machineset

import (
"bytes"
"context"
"encoding/json"
"fmt"
Expand Down Expand Up @@ -61,6 +62,7 @@ type Controller struct {
mapiStats MachineResourceStats
capiMachineSetStats MachineResourceStats
capiMachineDeploymentStats MachineResourceStats
mapiBootImageState map[string]BootImageState
conditionMutex sync.Mutex
mapiSyncMutex sync.Mutex

Expand All @@ -74,6 +76,13 @@ type MachineResourceStats struct {
totalCount int
}

// State structure uses for detecting hot loops. Reset when cluster is opted
// out of boot image updates.
type BootImageState struct {
value []byte
hotLoopCount int
}

// Helper function that checks if all resources have been evaluated
func (mrs MachineResourceStats) isFinished() bool {
return mrs.totalCount == (mrs.inProgress + mrs.erroredCount)
Expand All @@ -91,6 +100,9 @@ const (

ArchLabelKey = "kubernetes.io/arch="
OSLabelKey = "machine.openshift.io/os-id"

// Threshold for hot loop detection
HotLoopLimit = 3
)

// New returns a new machine-set-boot-image controller.
Expand Down Expand Up @@ -145,6 +157,8 @@ func New(

ctrl.featureGateAccess = featureGateAccess

ctrl.mapiBootImageState = map[string]BootImageState{}

return ctrl
}

Expand Down Expand Up @@ -355,6 +369,11 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) {
}
if !machineManagerFound {
klog.V(4).Infof("No MAPI machineset manager was found, so no MAPI machinesets will be enrolled.")
// clear out MAPI boot image history
for k := range ctrl.mapiBootImageState {
delete(ctrl.mapiBootImageState, k)
}

}

mapiMachineSets, err := ctrl.mapiMachineSetLister.List(machineResourceSelector)
Expand All @@ -367,6 +386,10 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) {
// If no machine resources were enrolled; exit the enqueue process without errors.
if len(mapiMachineSets) == 0 {
klog.Infof("No MAPI machinesets were enrolled, so no MAPI machinesets will be enqueued.")
// clear out MAPI boot image history
for k := range ctrl.mapiBootImageState {
delete(ctrl.mapiBootImageState, k)
}
}

// Reset stats before initiating reconciliation loop
Expand Down Expand Up @@ -477,13 +500,44 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet

// Patch the machineset if required
if patchRequired {
// First, check if we're hot looping
if ctrl.checkMAPIMachineSetHotLoop(newMachineSet) {
return fmt.Errorf("refusing to reconcile machineset %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", machineSet.Name)
}
klog.Infof("Patching MAPI machineset %s", machineSet.Name)
return ctrl.patchMachineSet(machineSet, newMachineSet)
}
klog.Infof("No patching required for MAPI machineset %s", machineSet.Name)
return nil
}

// Checks against a local store of boot image updates to detect hot looping
func (ctrl *Controller) checkMAPIMachineSetHotLoop(machineSet *machinev1beta1.MachineSet) bool {
bis, ok := ctrl.mapiBootImageState[machineSet.Name]
if !ok {
// If the machineset doesn't currently have a record, create a new one.
ctrl.mapiBootImageState[machineSet.Name] = BootImageState{
value: machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw,
hotLoopCount: 1,
}
} else {
hotLoopCount := 1
// If the controller is updating to a value that was previously updated to, increase the hot loop counter
if bytes.Equal(bis.value, machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw) {
hotLoopCount = (bis.hotLoopCount) + 1
}
// Return an error and degrade if the hot loop counter is above threshold
if hotLoopCount > HotLoopLimit {
return true
}
ctrl.mapiBootImageState[machineSet.Name] = BootImageState{
value: machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw,
hotLoopCount: hotLoopCount,
}
}
return false
}

// This function patches the machineset object using the machineClient
// Returns an error if marshsalling or patching fails.
func (ctrl *Controller) patchMachineSet(oldMachineSet, newMachineSet *machinev1beta1.MachineSet) error {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package machineset

import (
"testing"

machinev1beta1 "github.com/openshift/api/machine/v1beta1"
"github.com/stretchr/testify/assert"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)

func TestHotLoop(t *testing.T) {
cases := []struct {
name string
machineset *machinev1beta1.MachineSet
generateBootImageFunc func(string) string
updateCount int
expectHotLoop bool
}{
{
name: "Hot loop detected due to multiple updates to same value",
machineset: getMachineSet("machine-set-1", "boot-image-1"),
updateCount: HotLoopLimit + 1,
generateBootImageFunc: func(s string) string { return s },
expectHotLoop: true,
},
{
name: "Hot loop not detected due to update count being lower than threshold",
machineset: getMachineSet("machine-set-1", "boot-image-1"),
generateBootImageFunc: func(s string) string { return s },
updateCount: HotLoopLimit - 1,
expectHotLoop: false,
},
{
name: "Hot loop not detected due to target boot image changing",
machineset: getMachineSet("machine-set-1", "boot-image-1"),
generateBootImageFunc: func(s string) string { return s + "-1" },
updateCount: HotLoopLimit + 1,
expectHotLoop: false,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

ctrl := &Controller{
mapiBootImageState: map[string]BootImageState{},
}
// checkMAPIMachineSetHotLoop is only called in the controller when a patch is required,
// so every call to it is assuming a boot image update is going to take place.

// No hot loops should be detected in the first (updateCount - 1) calls
var hotLoopDetected bool
for range tc.updateCount - 1 {
hotLoopDetected = ctrl.checkMAPIMachineSetHotLoop(tc.machineset)
assert.Equal(t, false, hotLoopDetected)
// Change target boot image for next iteration
setMachineSetBootImage(tc.machineset, tc.generateBootImageFunc)
}
// Check for hot loop on the last iteration
hotLoopDetected = ctrl.checkMAPIMachineSetHotLoop(tc.machineset)
assert.Equal(t, tc.expectHotLoop, hotLoopDetected)
})
}
}

// Returns a machineset with a given boot image
func getMachineSet(name, bootImage string) *machinev1beta1.MachineSet {
return &machinev1beta1.MachineSet{
ObjectMeta: v1.ObjectMeta{
Name: name,
},
Spec: machinev1beta1.MachineSetSpec{
Template: machinev1beta1.MachineTemplateSpec{
Spec: machinev1beta1.MachineSpec{
ProviderSpec: machinev1beta1.ProviderSpec{
Value: &runtime.RawExtension{
Raw: []byte(bootImage),
},
},
},
},
},
}
}

// Sets the boot image in a machineset based on the generate function passed in
func setMachineSetBootImage(machineset *machinev1beta1.MachineSet, generateBootImageFunc func(string) string) {
currentBootImage := string(machineset.Spec.Template.Spec.ProviderSpec.Value.Raw)
machineset.Spec.Template.Spec.ProviderSpec.Value.Raw = []byte(generateBootImageFunc(currentBootImage))
}