-
Notifications
You must be signed in to change notification settings - Fork 218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Alleviate race conditions in roll restart reconciler #694
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -67,13 +67,17 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) { | |||||
status := r.findStatus() | ||||||
var pendingUpdate bool | ||||||
|
||||||
// Check that all nodes are ready before doing work | ||||||
// Also check if there are pending updates for all nodes. | ||||||
statefulSets := []appsv1.StatefulSet{} | ||||||
for _, nodePool := range r.instance.Spec.NodePools { | ||||||
sts, err := r.client.GetStatefulSet(builders.StsName(r.instance, &nodePool), r.instance.Namespace) | ||||||
if err != nil { | ||||||
return ctrl.Result{}, err | ||||||
} | ||||||
statefulSets = append(statefulSets, sts) | ||||||
} | ||||||
|
||||||
// Check if there are pending updates for all nodes. | ||||||
for _, sts := range statefulSets { | ||||||
if sts.Status.UpdateRevision != "" && | ||||||
sts.Status.UpdatedReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) { | ||||||
pendingUpdate = true | ||||||
|
@@ -91,12 +95,6 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) { | |||||
} | ||||||
|
||||||
} | ||||||
if sts.Status.ReadyReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) { | ||||||
return ctrl.Result{ | ||||||
Requeue: true, | ||||||
RequeueAfter: 10 * time.Second, | ||||||
}, nil | ||||||
} | ||||||
} | ||||||
|
||||||
if !pendingUpdate { | ||||||
|
@@ -118,6 +116,47 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) { | |||||
return ctrl.Result{}, nil | ||||||
} | ||||||
|
||||||
// Check if there is any crashed pod. Delete it if there is any update in sts. | ||||||
any_restarted_pod := false | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use camelCase for any variables |
||||||
for _, sts := range statefulSets { | ||||||
restared_pod, err := helpers.DeleteStuckPodWithOlderRevision(r.client, &sts) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo
Suggested change
|
||||||
if err != nil { | ||||||
return ctrl.Result{}, err | ||||||
} | ||||||
if restared_pod { | ||||||
any_restarted_pod = true | ||||||
} | ||||||
} | ||||||
if any_restarted_pod { | ||||||
return ctrl.Result{ | ||||||
Requeue: true, | ||||||
RequeueAfter: 10 * time.Second, | ||||||
}, nil | ||||||
} | ||||||
|
||||||
// Check that all nodes of all pools are ready before doing work | ||||||
for _, nodePool := range r.instance.Spec.NodePools { | ||||||
sts, err := r.client.GetStatefulSet(builders.StsName(r.instance, &nodePool), r.instance.Namespace) | ||||||
if err != nil { | ||||||
return ctrl.Result{}, err | ||||||
} | ||||||
if sts.Status.ReadyReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) { | ||||||
return ctrl.Result{ | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a log line (can be debug) so it is visible the operator is waiting on pods being ready |
||||||
Requeue: true, | ||||||
RequeueAfter: 10 * time.Second, | ||||||
}, nil | ||||||
} | ||||||
// CountRunningPodsForNodePool provides a more consistent view of the sts. Status of statefulset | ||||||
// is eventually consistent. Listing pods is more consistent. | ||||||
numReadyPods, err := helpers.CountRunningPodsForNodePool(r.client, r.instance, &nodePool) | ||||||
if err != nil || numReadyPods != int(pointer.Int32Deref(sts.Spec.Replicas, 1)) { | ||||||
return ctrl.Result{ | ||||||
Requeue: true, | ||||||
RequeueAfter: 10 * time.Second, | ||||||
}, nil | ||||||
} | ||||||
} | ||||||
|
||||||
// Skip a rolling restart if the cluster hasn't finished initializing | ||||||
if !r.instance.Status.Initialized { | ||||||
return ctrl.Result{ | ||||||
|
@@ -139,27 +178,17 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) { | |||||
return ctrl.Result{}, err | ||||||
} | ||||||
|
||||||
// Restart StatefulSet pod. Order is not important So we just pick the first we find | ||||||
|
||||||
// Restart a single pod of a StatefulSet. Order is not important so we just pick the first we find | ||||||
for _, nodePool := range r.instance.Spec.NodePools { | ||||||
sts, err := r.client.GetStatefulSet(builders.StsName(r.instance, &nodePool), r.instance.Namespace) | ||||||
if err != nil { | ||||||
return ctrl.Result{}, err | ||||||
} | ||||||
if sts.Status.UpdateRevision != "" && | ||||||
sts.Status.UpdatedReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) { | ||||||
// Only restart pods if not all pods are updated and the sts is healthy with no pods terminating | ||||||
if sts.Status.ReadyReplicas == pointer.Int32Deref(sts.Spec.Replicas, 1) { | ||||||
if numReadyPods, err := helpers.CountRunningPodsForNodePool(r.client, r.instance, &nodePool); err == nil && numReadyPods == int(pointer.Int32Deref(sts.Spec.Replicas, 1)) { | ||||||
lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name)) | ||||||
return r.restartStatefulSetPod(&sts) | ||||||
} | ||||||
} else { // Check if there is any crashed pod. Delete it if there is any update in sts. | ||||||
err = helpers.DeleteStuckPodWithOlderRevision(r.client, &sts) | ||||||
if err != nil { | ||||||
return ctrl.Result{}, err | ||||||
} | ||||||
} | ||||||
|
||||||
lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name)) | ||||||
return r.restartStatefulSetPod(&sts) | ||||||
} | ||||||
} | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not the correct file for the tests, they should go in a file
helpers_test.go
, thehelpers_suite_test.go
only contains the init/start code for the test of the entire package.