OCPBUGS-23514: Improve the message when a CO update takes too long

hongkailiu · hongkailiu · commit bd6ed4c5efcf · 2025-02-27T20:53:57.000-05:00
When upgrading a cluster operator takes too long (90m+ for machine-config and 30m+ for others), clusterversion shows a message indicating that the upgrade might have hit an issue. This covers the case in the related OCPBUGS-23538, where, for some reason, the pod under the deployment that manages the CO hits CrashLoopBackOff. The deployment controller does not give useful conditions in this situation [1]; otherwise, checkDeploymentHealth [2] would detect it. Rather than having the CVO figure out the underlying pod's CrashLoopBackOff, which might be better implemented by the deployment controller, the cluster admin is expected to start digging into the cluster when such a message pops up. For now, we only modify the condition's message. We could propagate Fail=True if such a requirement is collected from customers. [1] kubernetes/kubernetes#106054 [2] https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go
@@ -555,6 +555,14 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
 	case payload.UpdateEffectReport:
 		return uErr.Reason, uErr.Error(), false
 	case payload.UpdateEffectNone:
+		m := time.Duration(30)
+		// It takes longer to upgrade MCO
+		if uErr.Name == "machine-config" {
+			m = 3 * m
+		}
+		if payload.COUpdateStartTimesGet(uErr.Name).Before(now.Add(-(m * time.Minute))) {
+			return uErr.Reason, fmt.Sprintf("waiting on %s over %d minutes which is longer than expected", uErr.Name, m), true
+		}
 		return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
 	case payload.UpdateEffectFail:
 		return "", "", false
diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go
@@ -358,12 +358,15 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 	type args struct {
 		syncWorkerStatus *SyncWorkerStatus
 	}
+	payload.COUpdateStartTimesEnsure("co-not-timeout")
+	defer payload.COUpdateStartTimesRemove("co-not-timeout")
 	tests := []struct {
 		name                                             string
 		args                                             args
 		shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
 		expectedConditionNotModified                     *configv1.ClusterOperatorStatusCondition
 		expectedConditionModified                        *configv1.ClusterOperatorStatusCondition
+		expectedProgressingCondition                     *configv1.ClusterOperatorStatusCondition
 	}{
 		{
 			name: "no errors are present",
@@ -398,6 +401,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 						UpdateEffect: payload.UpdateEffectNone,
 						Reason:       "ClusterOperatorUpdating",
 						Message:      "Cluster operator A is updating",
+						Name:         "co-not-timeout",
 					},
 				},
 			},
@@ -412,6 +416,72 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 				Type:   ClusterStatusFailing,
 				Status: configv1.ConditionFalse,
 			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards <unknown>: waiting on co-not-timeout",
+			},
+		},
+		{
+			name: "single UpdateEffectNone error and machine-config timeout",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorUpdating",
+						Message:      "Cluster operator A is updating",
+						Name:         "co-timeout",
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Cluster operator A is updating",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:   ClusterStatusFailing,
+				Status: configv1.ConditionFalse,
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards <unknown>: waiting on co-timeout over 30 minutes which is longer than expected",
+			},
+		},
+		{
+			name: "single UpdateEffectNone error and timeout",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorUpdating",
+						Message:      "Cluster operator A is updating",
+						Name:         "machine-config",
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Cluster operator A is updating",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:   ClusterStatusFailing,
+				Status: configv1.ConditionFalse,
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards <unknown>: waiting on machine-config over 90 minutes which is longer than expected",
+			},
 		},
 		{
 			name: "single condensed UpdateEffectFail UpdateError",
@@ -621,6 +691,13 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 				if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
 					t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
 				}
+
+				if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
+					progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
+					if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
+						t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
+					}
+				}
 			}
 		})
 	}