Skip to content

Commit 02fad45

Browse files
committed
pkg/cvo/internal: Do not block on Degraded=True ClusterOperator
We have been blocking on this condition since 545c342 (api: make status substruct on operatorstatus, 2018-10-15, #31) when it was Failing. We'd softened our install-time handling to act this way back in b0b4902 (clusteroperator: Don't block on failing during initialization, 2019-03-11, #136), motivated by install speed [1]. And a degraded operator may slow dependent components in their own transitions. But as long as the operator/operand are available at all, it should not block dependent components from transitioning, so this commit removes the Degraded=True block from the remaining modes. We still have the critical ClusterOperatorDegraded alert waking admins up when an operator goes Degraded=True for a while; we will just no longer block updates at that point. We won't block ReconcilingMode manifest application either, but since that's already flattened and permuted, and ClusterOperators tend to be towards the end of their TaskNode, the impact on ReconcilingMode is minimal (except that we will no longer go Failing=True in ClusterVersion when the only issue is some Degraded=True ClusterOperator). [1]: #136 (comment)
1 parent dc6f4f0 commit 02fad45

File tree

4 files changed

+9
-79
lines changed

4 files changed

+9
-79
lines changed

docs/user/reconciliation.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ The ClusterOperator builder only monitors the in-cluster object and blocks until
135135
would block until the in-cluster ClusterOperator reported `operator` at version 4.1.0.
136136

137137
The progressing check is deprecated and will be removed once all operators are reporting versions.
138-
* Not degraded (except during initialization, where we ignore the degraded status)
139138

140139
### CustomResourceDefinition
141140

pkg/cvo/internal/operatorstatus.go

Lines changed: 6 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -169,63 +169,23 @@ func waitForOperatorStatusToBeDone(ctx context.Context, interval time.Duration,
169169
}
170170

171171
available := false
172+
degraded := true
172173
progressing := true
173-
failing := true
174-
var failingCondition *configv1.ClusterOperatorStatusCondition
175-
degradedValue := true
176-
var degradedCondition *configv1.ClusterOperatorStatusCondition
177174
for i := range actual.Status.Conditions {
178175
condition := &actual.Status.Conditions[i]
179176
switch {
180177
case condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionTrue:
181178
available = true
179+
case condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionFalse:
180+
degraded = false
182181
case condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionFalse:
183182
progressing = false
184-
case condition.Type == configv1.OperatorDegraded:
185-
if condition.Status == configv1.ConditionFalse {
186-
degradedValue = false
187-
}
188-
degradedCondition = condition
189-
}
190-
}
191-
192-
// If degraded was an explicitly set condition, use that. If not, use the deprecated failing.
193-
degraded := failing
194-
if degradedCondition != nil {
195-
degraded = degradedValue
196-
}
197-
198-
switch mode {
199-
case resourcebuilder.InitializingMode:
200-
// during initialization we allow degraded as long as the component goes available
201-
if available && (!progressing || len(expected.Status.Versions) > 0) {
202-
return true, nil
203-
}
204-
default:
205-
// if we're at the correct version, and available, and not degraded, we are done
206-
// if we're available, not degraded, and not progressing, we're also done
207-
// TODO: remove progressing once all cluster operators report expected versions
208-
if available && (!progressing || len(expected.Status.Versions) > 0) && !degraded {
209-
return true, nil
210183
}
211184
}
212185

213-
condition := failingCondition
214-
if degradedCondition != nil {
215-
condition = degradedCondition
216-
}
217-
if condition != nil && condition.Status == configv1.ConditionTrue {
218-
message := fmt.Sprintf("Cluster operator %s is reporting a failure", actual.Name)
219-
if len(condition.Message) > 0 {
220-
message = fmt.Sprintf("Cluster operator %s is reporting a failure: %s", actual.Name, condition.Message)
221-
}
222-
lastErr = &payload.UpdateError{
223-
Nested: errors.New(lowerFirst(message)),
224-
Reason: "ClusterOperatorDegraded",
225-
Message: message,
226-
Name: actual.Name,
227-
}
228-
return false, nil
186+
// TODO: remove progressing once all cluster operators report expected versions
187+
if available && (!progressing || len(expected.Status.Versions) > 0) {
188+
return true, nil
229189
}
230190

231191
lastErr = &payload.UpdateError{

pkg/cvo/internal/operatorstatus_test.go

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -315,9 +315,9 @@ func Test_waitForOperatorStatusToBeDone(t *testing.T) {
315315
},
316316
},
317317
expErr: &payload.UpdateError{
318-
Nested: fmt.Errorf("cluster operator test-co is reporting a failure: random error"),
319-
Reason: "ClusterOperatorDegraded",
320-
Message: "Cluster operator test-co is reporting a failure: random error",
318+
Nested: fmt.Errorf("cluster operator test-co is not done; it is available=false, progressing=true, degraded=true"),
319+
Reason: "ClusterOperatorNotAvailable",
320+
Message: "Cluster operator test-co has not yet reported success",
321321
Name: "test-co",
322322
},
323323
}, {
@@ -343,12 +343,6 @@ func Test_waitForOperatorStatusToBeDone(t *testing.T) {
343343
}},
344344
},
345345
},
346-
expErr: &payload.UpdateError{
347-
Nested: fmt.Errorf("cluster operator test-co is not done; it is available=true, progressing=true, degraded=true"),
348-
Reason: "ClusterOperatorNotAvailable",
349-
Message: "Cluster operator test-co has not yet reported success",
350-
Name: "test-co",
351-
},
352346
}, {
353347
name: "cluster operator reporting available=true degraded=true",
354348
actual: &configv1.ClusterOperator{
@@ -372,12 +366,6 @@ func Test_waitForOperatorStatusToBeDone(t *testing.T) {
372366
}},
373367
},
374368
},
375-
expErr: &payload.UpdateError{
376-
Nested: fmt.Errorf("cluster operator test-co is reporting a failure: random error"),
377-
Reason: "ClusterOperatorDegraded",
378-
Message: "Cluster operator test-co is reporting a failure: random error",
379-
Name: "test-co",
380-
},
381369
}, {
382370
name: "cluster operator reporting available=true progressing=true degraded=true",
383371
actual: &configv1.ClusterOperator{
@@ -401,12 +389,6 @@ func Test_waitForOperatorStatusToBeDone(t *testing.T) {
401389
}},
402390
},
403391
},
404-
expErr: &payload.UpdateError{
405-
Nested: fmt.Errorf("cluster operator test-co is reporting a failure: random error"),
406-
Reason: "ClusterOperatorDegraded",
407-
Message: "Cluster operator test-co is reporting a failure: random error",
408-
Name: "test-co",
409-
},
410392
}, {
411393
name: "cluster operator reporting available=true no progressing or degraded",
412394
actual: &configv1.ClusterOperator{
@@ -430,12 +412,6 @@ func Test_waitForOperatorStatusToBeDone(t *testing.T) {
430412
}},
431413
},
432414
},
433-
expErr: &payload.UpdateError{
434-
Nested: fmt.Errorf("cluster operator test-co is not done; it is available=true, progressing=true, degraded=true"),
435-
Reason: "ClusterOperatorNotAvailable",
436-
Message: "Cluster operator test-co has not yet reported success",
437-
Name: "test-co",
438-
},
439415
}, {
440416
name: "cluster operator reporting available=true progressing=false degraded=false",
441417
actual: &configv1.ClusterOperator{

pkg/payload/task.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -195,11 +195,6 @@ func SummaryForReason(reason, name string) string {
195195
case "UpgradePreconditionCheckFailed":
196196
return "it may not be safe to apply this update"
197197

198-
case "ClusterOperatorDegraded":
199-
if len(name) > 0 {
200-
return fmt.Sprintf("the cluster operator %s is degraded", name)
201-
}
202-
return "a cluster operator is degraded"
203198
case "ClusterOperatorNotAvailable":
204199
if len(name) > 0 {
205200
return fmt.Sprintf("the cluster operator %s has not yet successfully rolled out", name)

0 commit comments

Comments
 (0)