Skip to content

Commit bd6ed4c

Browse files
committed
OCPBUGS-23514: Better a message upon long CO updating
When it takes too long (90m+ for machine-config and 30m+ for others) to upgrade a cluster operator, clusterversion shows a message with the indication that the upgrade might hit some issue. This will cover the case in the related OCPBUGS-23538: for some reason, the pod under the deployment that manages the CO hit CrashLoopBackOff. Deployment controller does not give useful conditions in this situation [1]. Otherwise, checkDeploymentHealth [2] would detect it. Instead of CVO's figuring out the underlying pod's CrashLoopBackOff which might be better to be implemented by deployment controller, it is expected that our cluster admin starts to dig into the cluster when such a message pops up. For now, we just modify the condition's message. We could propagate Fail=True in case such requirements are collected from customers. [1]. kubernetes/kubernetes#106054 [2]. https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
1 parent ff3778c commit bd6ed4c

File tree

2 files changed

+85
-0
lines changed

2 files changed

+85
-0
lines changed

pkg/cvo/status.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,14 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
555555
case payload.UpdateEffectReport:
556556
return uErr.Reason, uErr.Error(), false
557557
case payload.UpdateEffectNone:
558+
m := time.Duration(30)
559+
// It takes longer to upgrade MCO
560+
if uErr.Name == "machine-config" {
561+
m = 3 * m
562+
}
563+
if payload.COUpdateStartTimesGet(uErr.Name).Before(now.Add(-(m * time.Minute))) {
564+
return uErr.Reason, fmt.Sprintf("waiting on %s over %d minutes which is longer than expected", uErr.Name, m), true
565+
}
558566
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
559567
case payload.UpdateEffectFail:
560568
return "", "", false

pkg/cvo/status_test.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,12 +358,15 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
358358
type args struct {
359359
syncWorkerStatus *SyncWorkerStatus
360360
}
361+
payload.COUpdateStartTimesEnsure("co-not-timeout")
362+
defer payload.COUpdateStartTimesRemove("co-not-timeout")
361363
tests := []struct {
362364
name string
363365
args args
364366
shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
365367
expectedConditionNotModified *configv1.ClusterOperatorStatusCondition
366368
expectedConditionModified *configv1.ClusterOperatorStatusCondition
369+
expectedProgressingCondition *configv1.ClusterOperatorStatusCondition
367370
}{
368371
{
369372
name: "no errors are present",
@@ -398,6 +401,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
398401
UpdateEffect: payload.UpdateEffectNone,
399402
Reason: "ClusterOperatorUpdating",
400403
Message: "Cluster operator A is updating",
404+
Name: "co-not-timeout",
401405
},
402406
},
403407
},
@@ -412,6 +416,72 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
412416
Type: ClusterStatusFailing,
413417
Status: configv1.ConditionFalse,
414418
},
419+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
420+
Type: configv1.OperatorProgressing,
421+
Status: configv1.ConditionTrue,
422+
Reason: "ClusterOperatorUpdating",
423+
Message: "Working towards <unknown>: waiting on co-not-timeout",
424+
},
425+
},
426+
{
427+
name: "single UpdateEffectNone error and machine-config timeout",
428+
args: args{
429+
syncWorkerStatus: &SyncWorkerStatus{
430+
Failure: &payload.UpdateError{
431+
UpdateEffect: payload.UpdateEffectNone,
432+
Reason: "ClusterOperatorUpdating",
433+
Message: "Cluster operator A is updating",
434+
Name: "co-timeout",
435+
},
436+
},
437+
},
438+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
439+
Type: ClusterStatusFailing,
440+
Status: configv1.ConditionTrue,
441+
Reason: "ClusterOperatorUpdating",
442+
Message: "Cluster operator A is updating",
443+
},
444+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
445+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
446+
Type: ClusterStatusFailing,
447+
Status: configv1.ConditionFalse,
448+
},
449+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
450+
Type: configv1.OperatorProgressing,
451+
Status: configv1.ConditionTrue,
452+
Reason: "ClusterOperatorUpdating",
453+
Message: "Working towards <unknown>: waiting on co-timeout over 30 minutes which is longer than expected",
454+
},
455+
},
456+
{
457+
name: "single UpdateEffectNone error and timeout",
458+
args: args{
459+
syncWorkerStatus: &SyncWorkerStatus{
460+
Failure: &payload.UpdateError{
461+
UpdateEffect: payload.UpdateEffectNone,
462+
Reason: "ClusterOperatorUpdating",
463+
Message: "Cluster operator A is updating",
464+
Name: "machine-config",
465+
},
466+
},
467+
},
468+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
469+
Type: ClusterStatusFailing,
470+
Status: configv1.ConditionTrue,
471+
Reason: "ClusterOperatorUpdating",
472+
Message: "Cluster operator A is updating",
473+
},
474+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
475+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
476+
Type: ClusterStatusFailing,
477+
Status: configv1.ConditionFalse,
478+
},
479+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
480+
Type: configv1.OperatorProgressing,
481+
Status: configv1.ConditionTrue,
482+
Reason: "ClusterOperatorUpdating",
483+
Message: "Working towards <unknown>: waiting on machine-config over 90 minutes which is longer than expected",
484+
},
415485
},
416486
{
417487
name: "single condensed UpdateEffectFail UpdateError",
@@ -621,6 +691,13 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
621691
if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
622692
t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
623693
}
694+
695+
if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
696+
progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
697+
if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
698+
t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
699+
}
700+
}
624701
}
625702
})
626703
}

0 commit comments

Comments
 (0)