Skip to content

Commit aae92b4

Browse files
Merge pull request #5050 from openshift-cherrypick-robot/cherry-pick-5037-to-release-4.19
[release-4.19] OCPBUGS-56180: Add hot loop detection in the boot image controller
2 parents 6682dd1 + 6afdfa1 commit aae92b4

File tree

2 files changed

+145
-0
lines changed

2 files changed

+145
-0
lines changed

pkg/controller/machine-set-boot-image/machine_set_boot_image_controller.go

+54
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package machineset
22

33
import (
4+
"bytes"
45
"context"
56
"encoding/json"
67
"fmt"
@@ -61,6 +62,7 @@ type Controller struct {
6162
mapiStats MachineResourceStats
6263
capiMachineSetStats MachineResourceStats
6364
capiMachineDeploymentStats MachineResourceStats
65+
mapiBootImageState map[string]BootImageState
6466
conditionMutex sync.Mutex
6567
mapiSyncMutex sync.Mutex
6668

@@ -74,6 +76,13 @@ type MachineResourceStats struct {
7476
totalCount int
7577
}
7678

79+
// State structure uses for detecting hot loops. Reset when cluster is opted
80+
// out of boot image updates.
81+
type BootImageState struct {
82+
value []byte
83+
hotLoopCount int
84+
}
85+
7786
// Helper function that checks if all resources have been evaluated
7887
func (mrs MachineResourceStats) isFinished() bool {
7988
return mrs.totalCount == (mrs.inProgress + mrs.erroredCount)
@@ -91,6 +100,9 @@ const (
91100

92101
ArchLabelKey = "kubernetes.io/arch="
93102
OSLabelKey = "machine.openshift.io/os-id"
103+
104+
// Threshold for hot loop detection
105+
HotLoopLimit = 3
94106
)
95107

96108
// New returns a new machine-set-boot-image controller.
@@ -145,6 +157,8 @@ func New(
145157

146158
ctrl.featureGateAccess = featureGateAccess
147159

160+
ctrl.mapiBootImageState = map[string]BootImageState{}
161+
148162
return ctrl
149163
}
150164

@@ -355,6 +369,11 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) {
355369
}
356370
if !machineManagerFound {
357371
klog.V(4).Infof("No MAPI machineset manager was found, so no MAPI machinesets will be enrolled.")
372+
// clear out MAPI boot image history
373+
for k := range ctrl.mapiBootImageState {
374+
delete(ctrl.mapiBootImageState, k)
375+
}
376+
358377
}
359378

360379
mapiMachineSets, err := ctrl.mapiMachineSetLister.List(machineResourceSelector)
@@ -367,6 +386,10 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) {
367386
// If no machine resources were enrolled; exit the enqueue process without errors.
368387
if len(mapiMachineSets) == 0 {
369388
klog.Infof("No MAPI machinesets were enrolled, so no MAPI machinesets will be enqueued.")
389+
// clear out MAPI boot image history
390+
for k := range ctrl.mapiBootImageState {
391+
delete(ctrl.mapiBootImageState, k)
392+
}
370393
}
371394

372395
// Reset stats before initiating reconciliation loop
@@ -477,13 +500,44 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet
477500

478501
// Patch the machineset if required
479502
if patchRequired {
503+
// First, check if we're hot looping
504+
if ctrl.checkMAPIMachineSetHotLoop(newMachineSet) {
505+
return fmt.Errorf("refusing to reconcile machineset %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", machineSet.Name)
506+
}
480507
klog.Infof("Patching MAPI machineset %s", machineSet.Name)
481508
return ctrl.patchMachineSet(machineSet, newMachineSet)
482509
}
483510
klog.Infof("No patching required for MAPI machineset %s", machineSet.Name)
484511
return nil
485512
}
486513

514+
// Checks against a local store of boot image updates to detect hot looping
515+
func (ctrl *Controller) checkMAPIMachineSetHotLoop(machineSet *machinev1beta1.MachineSet) bool {
516+
bis, ok := ctrl.mapiBootImageState[machineSet.Name]
517+
if !ok {
518+
// If the machineset doesn't currently have a record, create a new one.
519+
ctrl.mapiBootImageState[machineSet.Name] = BootImageState{
520+
value: machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw,
521+
hotLoopCount: 1,
522+
}
523+
} else {
524+
hotLoopCount := 1
525+
// If the controller is updating to a value that was previously updated to, increase the hot loop counter
526+
if bytes.Equal(bis.value, machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw) {
527+
hotLoopCount = (bis.hotLoopCount) + 1
528+
}
529+
// Return an error and degrade if the hot loop counter is above threshold
530+
if hotLoopCount > HotLoopLimit {
531+
return true
532+
}
533+
ctrl.mapiBootImageState[machineSet.Name] = BootImageState{
534+
value: machineSet.Spec.Template.Spec.ProviderSpec.Value.Raw,
535+
hotLoopCount: hotLoopCount,
536+
}
537+
}
538+
return false
539+
}
540+
487541
// This function patches the machineset object using the machineClient
488542
// Returns an error if marshsalling or patching fails.
489543
func (ctrl *Controller) patchMachineSet(oldMachineSet, newMachineSet *machinev1beta1.MachineSet) error {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package machineset
2+
3+
import (
4+
"testing"
5+
6+
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
7+
"github.com/stretchr/testify/assert"
8+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9+
"k8s.io/apimachinery/pkg/runtime"
10+
)
11+
12+
func TestHotLoop(t *testing.T) {
13+
cases := []struct {
14+
name string
15+
machineset *machinev1beta1.MachineSet
16+
generateBootImageFunc func(string) string
17+
updateCount int
18+
expectHotLoop bool
19+
}{
20+
{
21+
name: "Hot loop detected due to multiple updates to same value",
22+
machineset: getMachineSet("machine-set-1", "boot-image-1"),
23+
updateCount: HotLoopLimit + 1,
24+
generateBootImageFunc: func(s string) string { return s },
25+
expectHotLoop: true,
26+
},
27+
{
28+
name: "Hot loop not detected due to update count being lower than threshold",
29+
machineset: getMachineSet("machine-set-1", "boot-image-1"),
30+
generateBootImageFunc: func(s string) string { return s },
31+
updateCount: HotLoopLimit - 1,
32+
expectHotLoop: false,
33+
},
34+
{
35+
name: "Hot loop not detected due to target boot image changing",
36+
machineset: getMachineSet("machine-set-1", "boot-image-1"),
37+
generateBootImageFunc: func(s string) string { return s + "-1" },
38+
updateCount: HotLoopLimit + 1,
39+
expectHotLoop: false,
40+
},
41+
}
42+
for _, tc := range cases {
43+
t.Run(tc.name, func(t *testing.T) {
44+
t.Parallel()
45+
46+
ctrl := &Controller{
47+
mapiBootImageState: map[string]BootImageState{},
48+
}
49+
// checkMAPIMachineSetHotLoop is only called in the controller when a patch is required,
50+
// so every call to it is assuming a boot image update is going to take place.
51+
52+
// No hot loops should be detected in the first (updateCount - 1) calls
53+
var hotLoopDetected bool
54+
for range tc.updateCount - 1 {
55+
hotLoopDetected = ctrl.checkMAPIMachineSetHotLoop(tc.machineset)
56+
assert.Equal(t, false, hotLoopDetected)
57+
// Change target boot image for next iteration
58+
setMachineSetBootImage(tc.machineset, tc.generateBootImageFunc)
59+
}
60+
// Check for hot loop on the last iteration
61+
hotLoopDetected = ctrl.checkMAPIMachineSetHotLoop(tc.machineset)
62+
assert.Equal(t, tc.expectHotLoop, hotLoopDetected)
63+
})
64+
}
65+
}
66+
67+
// Returns a machineset with a given boot image
68+
func getMachineSet(name, bootImage string) *machinev1beta1.MachineSet {
69+
return &machinev1beta1.MachineSet{
70+
ObjectMeta: v1.ObjectMeta{
71+
Name: name,
72+
},
73+
Spec: machinev1beta1.MachineSetSpec{
74+
Template: machinev1beta1.MachineTemplateSpec{
75+
Spec: machinev1beta1.MachineSpec{
76+
ProviderSpec: machinev1beta1.ProviderSpec{
77+
Value: &runtime.RawExtension{
78+
Raw: []byte(bootImage),
79+
},
80+
},
81+
},
82+
},
83+
},
84+
}
85+
}
86+
87+
// Sets the boot image in a machineset based on the generate function passed in
88+
func setMachineSetBootImage(machineset *machinev1beta1.MachineSet, generateBootImageFunc func(string) string) {
89+
currentBootImage := string(machineset.Spec.Template.Spec.ProviderSpec.Value.Raw)
90+
machineset.Spec.Template.Spec.ProviderSpec.Value.Raw = []byte(generateBootImageFunc(currentBootImage))
91+
}

0 commit comments

Comments
 (0)