Skip to content

Commit b198db0

Browse files
committed
pkg/cvo/internal: Do not block on Degraded=True ClusterOperator
We have blocking on this condition since 545c342 (api: make status substruct on operatorstatus, 2018-10-15, #31) when it was Failing. We'd softened our install-time handling to act this way back in b0b4902 (clusteroperator: Don't block on failing during initialization, 2019-03-11, #136), motivated by install speed [1]. And a degraded operator may slow dependent components in their own transitions. But as long as the operator/operand are available at all, it should not block depndent components from transitioning, so this commit removes the Degraded=True block from the remaining modes. We still have the warning ClusterOperatorDegraded alerting admins when an operator goes Degraded=True for a while, we will just no longer block updates at that point. We won't block ReconcilingMode manifest application either, but since that's already flattened and permuted, and ClusterOperator tend to be towards the end of their TaskNode, the impact on ReconcilingMode is minimal (except that we will no longer go Failing=True in ClusterVersion when the only issue is some Degraded=True ClusterOperator). [1]: #136 (comment)
1 parent b0eddfe commit b198db0

File tree

7 files changed

+43
-72
lines changed

7 files changed

+43
-72
lines changed

pkg/cvo/internal/operatorstatus.go

+15-11
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,15 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
232232
}
233233
}
234234

235+
var degradedError error
235236
if degraded {
236237
if degradedCondition != nil && len(degradedCondition.Message) > 0 {
237238
nestedMessage = fmt.Errorf("cluster operator %s is %s=%s: %s, %s", actual.Name, degradedCondition.Type, degradedCondition.Status, degradedCondition.Reason, degradedCondition.Message)
238239
}
239-
var updateEffect payload.UpdateEffectType
240240

241-
if mode == resourcebuilder.InitializingMode {
242-
updateEffect = payload.UpdateEffectReport
243-
} else {
244-
updateEffect = payload.UpdateEffectFailAfterInterval
245-
}
246-
return &payload.UpdateError{
241+
degradedError = &payload.UpdateError{
247242
Nested: nestedMessage,
248-
UpdateEffect: updateEffect,
243+
UpdateEffect: payload.UpdateEffectReport,
249244
Reason: "ClusterOperatorDegraded",
250245
PluralReason: "ClusterOperatorsDegraded",
251246
Message: fmt.Sprintf("Cluster operator %s is degraded", actual.Name),
@@ -256,10 +251,19 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
256251

257252
// during initialization we allow undone versions
258253
if len(undone) > 0 && mode != resourcebuilder.InitializingMode {
259-
nestedMessage = fmt.Errorf("cluster operator %s is available and not degraded but has not finished updating to target version", actual.Name)
254+
var updateEffect payload.UpdateEffectType
255+
256+
if degradedError == nil {
257+
nestedMessage = fmt.Errorf("cluster operator %s is available and not degraded but has not finished updating to target version", actual.Name)
258+
updateEffect = payload.UpdateEffectNone // block on versions, but nothing to go Failing=True over
259+
} else {
260+
nestedMessage = fmt.Errorf("cluster operator %s is available but %s=%s and has not finished updating to target version", actual.Name, degradedCondition.Type, degradedCondition.Status)
261+
updateEffect = payload.UpdateEffectFail // block on versions, go Failing=True on Degraded=True
262+
}
263+
260264
return &payload.UpdateError{
261265
Nested: nestedMessage,
262-
UpdateEffect: payload.UpdateEffectNone,
266+
UpdateEffect: updateEffect,
263267
Reason: "ClusterOperatorUpdating",
264268
PluralReason: "ClusterOperatorsUpdating",
265269
Message: fmt.Sprintf("Cluster operator %s is updating versions", actual.Name),
@@ -268,5 +272,5 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
268272
}
269273
}
270274

271-
return nil
275+
return degradedError
272276
}

pkg/cvo/internal/operatorstatus_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ func Test_checkOperatorHealth(t *testing.T) {
432432
},
433433
expErr: &payload.UpdateError{
434434
Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"),
435-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
435+
UpdateEffect: payload.UpdateEffectReport,
436436
Reason: "ClusterOperatorDegraded",
437437
PluralReason: "ClusterOperatorsDegraded",
438438
Message: "Cluster operator test-co is degraded",
@@ -504,7 +504,7 @@ func Test_checkOperatorHealth(t *testing.T) {
504504
},
505505
expErr: &payload.UpdateError{
506506
Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"),
507-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
507+
UpdateEffect: payload.UpdateEffectReport,
508508
Reason: "ClusterOperatorDegraded",
509509
PluralReason: "ClusterOperatorsDegraded",
510510
Message: "Cluster operator test-co is degraded",
@@ -573,7 +573,7 @@ func Test_checkOperatorHealth(t *testing.T) {
573573
},
574574
expErr: &payload.UpdateError{
575575
Nested: fmt.Errorf("cluster operator test-co: available=true, progressing=true, degraded=true, undone="),
576-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
576+
UpdateEffect: payload.UpdateEffectReport,
577577
Reason: "ClusterOperatorDegraded",
578578
PluralReason: "ClusterOperatorsDegraded",
579579
Message: "Cluster operator test-co is degraded",

pkg/cvo/status.go

-17
Original file line numberDiff line numberDiff line change
@@ -552,23 +552,6 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
552552
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
553553
case payload.UpdateEffectFail:
554554
return "", "", false
555-
case payload.UpdateEffectFailAfterInterval:
556-
var exceeded []string
557-
threshold := now.Add(-(40 * time.Minute))
558-
names := uErr.Names
559-
if len(names) == 0 {
560-
names = []string{uErr.Name}
561-
}
562-
for _, name := range names {
563-
if payload.COUpdateStartTimesGet(name).Before(threshold) {
564-
exceeded = append(exceeded, name)
565-
}
566-
}
567-
if len(exceeded) > 0 {
568-
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
569-
} else {
570-
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
571-
}
572555
}
573556
return "", "", false
574557
}

pkg/cvo/status_test.go

+15-23
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
439439
},
440440
},
441441
{
442-
name: "MultipleErrors of UpdateEffectFail and UpdateEffectFailAfterInterval",
442+
name: "MultipleErrors of UpdateEffectFail and UpdateEffectReport",
443443
args: args{
444444
syncWorkerStatus: &SyncWorkerStatus{
445445
Failure: &payload.UpdateError{
@@ -450,14 +450,14 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
450450
Message: "Cluster operator A is not available",
451451
},
452452
&payload.UpdateError{
453-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
453+
UpdateEffect: payload.UpdateEffectReport,
454454
Reason: "ClusterOperatorDegraded",
455455
Message: "Cluster operator B is degraded",
456456
},
457457
}),
458458
UpdateEffect: payload.UpdateEffectFail,
459-
Reason: "MultipleErrors",
460-
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is degraded",
459+
Reason: "ClusterOperatorNotAvailable",
460+
Message: "Cluster operator A is not available",
461461
},
462462
},
463463
},
@@ -541,7 +541,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
541541
},
542542
},
543543
{
544-
name: "MultipleErrors of UpdateEffectFail, UpdateEffectFailAfterInterval, and UpdateEffectNone",
544+
name: "MultipleErrors of UpdateEffectFail, UpdateEffectReport, and UpdateEffectNone",
545545
args: args{
546546
syncWorkerStatus: &SyncWorkerStatus{
547547
Failure: &payload.UpdateError{
@@ -557,14 +557,14 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
557557
Message: "Cluster operator B is updating versions",
558558
},
559559
&payload.UpdateError{
560-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
560+
UpdateEffect: payload.UpdateEffectReport,
561561
Reason: "ClusterOperatorDegraded",
562562
Message: "Cluster operator C is degraded",
563563
},
564564
}),
565565
UpdateEffect: payload.UpdateEffectFail,
566566
Reason: "MultipleErrors",
567-
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is updating versions\n* Cluster operator C is degraded",
567+
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is updating versions",
568568
},
569569
},
570570
},
@@ -664,10 +664,6 @@ func Test_filterOutUpdateErrors(t *testing.T) {
664664
Name: "Report",
665665
UpdateEffect: payload.UpdateEffectReport,
666666
},
667-
&payload.UpdateError{
668-
Name: "Fail After Interval",
669-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
670-
},
671667
},
672668
updateEffectType: payload.UpdateEffectNone,
673669
},
@@ -680,19 +676,15 @@ func Test_filterOutUpdateErrors(t *testing.T) {
680676
Name: "Report",
681677
UpdateEffect: payload.UpdateEffectReport,
682678
},
683-
&payload.UpdateError{
684-
Name: "Fail After Interval",
685-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
686-
},
687679
},
688680
},
689681
{
690682
name: "errors contain update errors of the specified value UpdateEffectNone",
691683
args: args{
692684
errs: []error{
693685
&payload.UpdateError{
694-
Name: "Fail After Interval",
695-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
686+
Name: "Fail",
687+
UpdateEffect: payload.UpdateEffectFail,
696688
},
697689
&payload.UpdateError{
698690
Name: "None #1",
@@ -711,8 +703,8 @@ func Test_filterOutUpdateErrors(t *testing.T) {
711703
},
712704
want: []error{
713705
&payload.UpdateError{
714-
Name: "Fail After Interval",
715-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
706+
Name: "Fail",
707+
UpdateEffect: payload.UpdateEffectFail,
716708
},
717709
&payload.UpdateError{
718710
Name: "Report",
@@ -725,8 +717,8 @@ func Test_filterOutUpdateErrors(t *testing.T) {
725717
args: args{
726718
errs: []error{
727719
&payload.UpdateError{
728-
Name: "Fail After Interval",
729-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
720+
Name: "Fail",
721+
UpdateEffect: payload.UpdateEffectFail,
730722
},
731723
&payload.UpdateError{
732724
Name: "None #1",
@@ -745,8 +737,8 @@ func Test_filterOutUpdateErrors(t *testing.T) {
745737
},
746738
want: []error{
747739
&payload.UpdateError{
748-
Name: "Fail After Interval",
749-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
740+
Name: "Fail",
741+
UpdateEffect: payload.UpdateEffectFail,
750742
},
751743
&payload.UpdateError{
752744
Name: "None #1",

pkg/cvo/sync_worker.go

-4
Original file line numberDiff line numberDiff line change
@@ -1264,10 +1264,6 @@ func condenseClusterOperators(errs []error) []error {
12641264
if updateEffect == payload.UpdateEffectReport {
12651265
updateEffect = payload.UpdateEffectNone
12661266
}
1267-
case payload.UpdateEffectFailAfterInterval:
1268-
if updateEffect != payload.UpdateEffectFail {
1269-
updateEffect = payload.UpdateEffectFailAfterInterval
1270-
}
12711267
case payload.UpdateEffectFail:
12721268
updateEffect = payload.UpdateEffectFail
12731269
}

pkg/cvo/sync_worker_test.go

+10-10
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,8 @@ func Test_condenseClusterOperators(t *testing.T) {
256256
Name: "test-co-B",
257257
Task: task("test-co-B", configv1.GroupVersion.WithKind("ClusterOperator")),
258258
}
259-
coCDegradedFailAfterInterval := &payload.UpdateError{
260-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
259+
coCDegraded := &payload.UpdateError{
260+
UpdateEffect: payload.UpdateEffectReport,
261261
Reason: "ClusterOperatorDegraded",
262262
PluralReason: "ClusterOperatorsDegraded",
263263
Message: "Cluster operator test-co-C is degraded",
@@ -318,10 +318,9 @@ func Test_condenseClusterOperators(t *testing.T) {
318318
input: []error{
319319
coBUpdating,
320320
coAUpdating,
321-
coCDegradedFailAfterInterval,
321+
coCDegraded,
322322
},
323323
expected: []error{
324-
coCDegradedFailAfterInterval, // Degraded sorts before Updating
325324
&payload.UpdateError{
326325
Nested: errors.NewAggregate([]error{coBUpdating, coAUpdating}),
327326
UpdateEffect: payload.UpdateEffectNone,
@@ -332,17 +331,18 @@ func Test_condenseClusterOperators(t *testing.T) {
332331
Name: "test-co-A, test-co-B",
333332
Names: []string{"test-co-A", "test-co-B"},
334333
},
334+
coCDegraded,
335335
},
336336
}, {
337337
name: "there ClusterOperator with the same reason but different effects",
338338
input: []error{
339339
coBDegradedFail,
340340
coADegradedNone,
341-
coCDegradedFailAfterInterval,
341+
coCDegraded,
342342
},
343343
expected: []error{
344344
&payload.UpdateError{
345-
Nested: errors.NewAggregate([]error{coBDegradedFail, coADegradedNone, coCDegradedFailAfterInterval}),
345+
Nested: errors.NewAggregate([]error{coBDegradedFail, coADegradedNone, coCDegraded}),
346346
UpdateEffect: payload.UpdateEffectFail,
347347
Reason: "ClusterOperatorsDegraded",
348348
PluralReason: "ClusterOperatorsDegraded",
@@ -353,15 +353,15 @@ func Test_condenseClusterOperators(t *testing.T) {
353353
},
354354
},
355355
}, {
356-
name: "to ClusterOperator with the same reason but None and FailAfterInterval effects",
356+
name: "to ClusterOperator with the same reason but None and Report effects",
357357
input: []error{
358358
coADegradedNone,
359-
coCDegradedFailAfterInterval,
359+
coCDegraded,
360360
},
361361
expected: []error{
362362
&payload.UpdateError{
363-
Nested: errors.NewAggregate([]error{coADegradedNone, coCDegradedFailAfterInterval}),
364-
UpdateEffect: payload.UpdateEffectFailAfterInterval,
363+
Nested: errors.NewAggregate([]error{coADegradedNone, coCDegraded}),
364+
UpdateEffect: payload.UpdateEffectFail,
365365
Reason: "ClusterOperatorsDegraded",
366366
PluralReason: "ClusterOperatorsDegraded",
367367
Message: "Cluster operators test-co-A, test-co-C are degraded",

pkg/payload/task.go

-4
Original file line numberDiff line numberDiff line change
@@ -161,10 +161,6 @@ const (
161161

162162
// UpdateEffectFail defines an error as indicating the update is failing.
163163
UpdateEffectFail UpdateEffectType = "Fail"
164-
165-
// UpdateEffectFailAfterInterval defines an error as one which indicates the update
166-
// is failing if the error continues for a defined interval.
167-
UpdateEffectFailAfterInterval UpdateEffectType = "FailAfterInterval"
168164
)
169165

170166
// UpdateError is a wrapper for errors that occur during a payload sync.

0 commit comments

Comments
 (0)