OTA-542: pkg/cvo/internal: Do not block on Degraded=True ClusterOperator #482

Open · wants to merge 1 commit into main
26 changes: 15 additions & 11 deletions pkg/cvo/internal/operatorstatus.go
@@ -232,20 +232,15 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
}
}

var degradedError error
if degraded {
if degradedCondition != nil && len(degradedCondition.Message) > 0 {
nestedMessage = fmt.Errorf("cluster operator %s is %s=%s: %s, %s", actual.Name, degradedCondition.Type, degradedCondition.Status, degradedCondition.Reason, degradedCondition.Message)
}
var updateEffect payload.UpdateEffectType

if mode == resourcebuilder.InitializingMode {
updateEffect = payload.UpdateEffectReport
} else {
updateEffect = payload.UpdateEffectFailAfterInterval
}
return &payload.UpdateError{
degradedError = &payload.UpdateError{
Nested: nestedMessage,
UpdateEffect: updateEffect,
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
PluralReason: "ClusterOperatorsDegraded",
Message: fmt.Sprintf("Cluster operator %s is degraded", actual.Name),
@@ -256,10 +251,19 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp

// during initialization we allow undone versions
if len(undone) > 0 && mode != resourcebuilder.InitializingMode {
nestedMessage = fmt.Errorf("cluster operator %s is available and not degraded but has not finished updating to target version", actual.Name)
var updateEffect payload.UpdateEffectType

if degradedError == nil {
nestedMessage = fmt.Errorf("cluster operator %s is available and not degraded but has not finished updating to target version", actual.Name)
updateEffect = payload.UpdateEffectNone // block on versions, but nothing to go Failing=True over
} else {
nestedMessage = fmt.Errorf("cluster operator %s is available but %s=%s and has not finished updating to target version", actual.Name, degradedCondition.Type, degradedCondition.Status)
updateEffect = payload.UpdateEffectFail // block on versions, go Failing=True on Degraded=True
}

return &payload.UpdateError{
Nested: nestedMessage,
UpdateEffect: payload.UpdateEffectNone,
UpdateEffect: updateEffect,
Reason: "ClusterOperatorUpdating",
PluralReason: "ClusterOperatorsUpdating",
Message: fmt.Sprintf("Cluster operator %s is updating versions", actual.Name),
@@ -268,5 +272,5 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
}
}

return nil
return degradedError
}
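Taken together, the hunks above rework the tail of checkOperatorHealth: Degraded=True on its own now yields only a Report-level error, and the update goes Failing=True over degradation only when the operator is also still behind on its target versions. A condensed sketch of the resulting flow, not part of the diff, with field lists abbreviated and the earlier availability checks elided:

    var degradedError error
    if degraded {
        // Degraded=True alone no longer blocks or fails the update; it is only reported.
        // Before this change the effect was FailAfterInterval outside of installs.
        degradedError = &payload.UpdateError{
            Nested:       nestedMessage,
            UpdateEffect: payload.UpdateEffectReport,
            Reason:       "ClusterOperatorDegraded",
            PluralReason: "ClusterOperatorsDegraded",
            Message:      fmt.Sprintf("Cluster operator %s is degraded", actual.Name),
        }
    }

    // During initialization, undone versions are allowed.
    if len(undone) > 0 && mode != resourcebuilder.InitializingMode {
        updateEffect := payload.UpdateEffectNone // block on versions, but nothing to go Failing=True over
        if degradedError != nil {
            updateEffect = payload.UpdateEffectFail // block on versions, go Failing=True on Degraded=True
        }
        return &payload.UpdateError{
            UpdateEffect: updateEffect,
            Reason:       "ClusterOperatorUpdating",
            PluralReason: "ClusterOperatorsUpdating",
            Message:      fmt.Sprintf("Cluster operator %s is updating versions", actual.Name),
        }
    }

    return degradedError // nil when the operator is neither degraded nor behind on versions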
6 changes: 3 additions & 3 deletions pkg/cvo/internal/operatorstatus_test.go
@@ -432,7 +432,7 @@ func Test_checkOperatorHealth(t *testing.T) {
},
expErr: &payload.UpdateError{
Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"),
UpdateEffect: payload.UpdateEffectFailAfterInterval,
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
PluralReason: "ClusterOperatorsDegraded",
Message: "Cluster operator test-co is degraded",
@@ -504,7 +504,7 @@ func Test_checkOperatorHealth(t *testing.T) {
},
expErr: &payload.UpdateError{
Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"),
UpdateEffect: payload.UpdateEffectFailAfterInterval,
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
PluralReason: "ClusterOperatorsDegraded",
Message: "Cluster operator test-co is degraded",
@@ -573,7 +573,7 @@ func Test_checkOperatorHealth(t *testing.T) {
},
expErr: &payload.UpdateError{
Nested: fmt.Errorf("cluster operator test-co: available=true, progressing=true, degraded=true, undone="),
UpdateEffect: payload.UpdateEffectFailAfterInterval,
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
PluralReason: "ClusterOperatorsDegraded",
Message: "Cluster operator test-co is degraded",
17 changes: 0 additions & 17 deletions pkg/cvo/status.go
@@ -552,23 +552,6 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
case payload.UpdateEffectFail:
return "", "", false
case payload.UpdateEffectFailAfterInterval:
var exceeded []string
threshold := now.Add(-(40 * time.Minute))
names := uErr.Names
if len(names) == 0 {
names = []string{uErr.Name}
}
for _, name := range names {
if payload.COUpdateStartTimesGet(name).Before(threshold) {
exceeded = append(exceeded, name)
}
}
if len(exceeded) > 0 {
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
} else {
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
}
}
return "", "", false
}
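With UpdateEffectFailAfterInterval gone, convertErrorToProgressing loses the 40-minute grace window entirely. A minimal sketch of the switch that remains, assuming the cases visible in the surrounding context lines keep their behavior; the case label on the first return is not shown in the hunk and is an assumption:

    switch uErr.UpdateEffect {
    case payload.UpdateEffectNone: // label assumed; the hunk only shows this return statement
        return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
    case payload.UpdateEffectFail:
        return "", "", false
    }
    return "", "", false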
38 changes: 15 additions & 23 deletions pkg/cvo/status_test.go
@@ -439,7 +439,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
},
},
{
name: "MultipleErrors of UpdateEffectFail and UpdateEffectFailAfterInterval",
name: "MultipleErrors of UpdateEffectFail and UpdateEffectReport",
args: args{
syncWorkerStatus: &SyncWorkerStatus{
Failure: &payload.UpdateError{
@@ -450,14 +450,14 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
Message: "Cluster operator A is not available",
},
&payload.UpdateError{
UpdateEffect: payload.UpdateEffectFailAfterInterval,
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
Message: "Cluster operator B is degraded",
},
}),
UpdateEffect: payload.UpdateEffectFail,
Reason: "MultipleErrors",
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is degraded",
Reason: "ClusterOperatorNotAvailable",
Message: "Cluster operator A is not available",
},
},
},
@@ -541,7 +541,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
},
},
{
name: "MultipleErrors of UpdateEffectFail, UpdateEffectFailAfterInterval, and UpdateEffectNone",
name: "MultipleErrors of UpdateEffectFail, UpdateEffectReport, and UpdateEffectNone",
args: args{
syncWorkerStatus: &SyncWorkerStatus{
Failure: &payload.UpdateError{
@@ -557,14 +557,14 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
Message: "Cluster operator B is updating versions",
},
&payload.UpdateError{
UpdateEffect: payload.UpdateEffectFailAfterInterval,
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
Message: "Cluster operator C is degraded",
},
}),
UpdateEffect: payload.UpdateEffectFail,
Reason: "MultipleErrors",
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is updating versions\n* Cluster operator C is degraded",
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator B is updating versions",
},
},
},
@@ -664,10 +664,6 @@ func Test_filterOutUpdateErrors(t *testing.T) {
Name: "Report",
UpdateEffect: payload.UpdateEffectReport,
},
&payload.UpdateError{
Name: "Fail After Interval",
UpdateEffect: payload.UpdateEffectFailAfterInterval,
},
},
updateEffectType: payload.UpdateEffectNone,
},
@@ -680,19 +676,15 @@ func Test_filterOutUpdateErrors(t *testing.T) {
Name: "Report",
UpdateEffect: payload.UpdateEffectReport,
},
&payload.UpdateError{
Name: "Fail After Interval",
UpdateEffect: payload.UpdateEffectFailAfterInterval,
},
},
},
{
name: "errors contain update errors of the specified value UpdateEffectNone",
args: args{
errs: []error{
&payload.UpdateError{
Name: "Fail After Interval",
UpdateEffect: payload.UpdateEffectFailAfterInterval,
Name: "Fail",
UpdateEffect: payload.UpdateEffectFail,
},
&payload.UpdateError{
Name: "None #1",
@@ -711,8 +703,8 @@ func Test_filterOutUpdateErrors(t *testing.T) {
},
want: []error{
&payload.UpdateError{
Name: "Fail After Interval",
UpdateEffect: payload.UpdateEffectFailAfterInterval,
Name: "Fail",
UpdateEffect: payload.UpdateEffectFail,
},
&payload.UpdateError{
Name: "Report",
@@ -725,8 +717,8 @@ func Test_filterOutUpdateErrors(t *testing.T) {
args: args{
errs: []error{
&payload.UpdateError{
Name: "Fail After Interval",
UpdateEffect: payload.UpdateEffectFailAfterInterval,
Name: "Fail",
UpdateEffect: payload.UpdateEffectFail,
},
&payload.UpdateError{
Name: "None #1",
@@ -745,8 +737,8 @@ func Test_filterOutUpdateErrors(t *testing.T) {
},
want: []error{
&payload.UpdateError{
Name: "Fail After Interval",
UpdateEffect: payload.UpdateEffectFailAfterInterval,
Name: "Fail",
UpdateEffect: payload.UpdateEffectFail,
},
&payload.UpdateError{
Name: "None #1",
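For context, Test_filterOutUpdateErrors exercises a helper that drops update errors of a given effect type. A rough sketch inferred from the test's args and want fields; the actual implementation in pkg/cvo may differ:

    func filterOutUpdateErrors(errs []error, effect payload.UpdateEffectType) []error {
        var kept []error
        for _, err := range errs {
            // Drop typed update errors whose effect matches the filtered type;
            // keep everything else, including plain errors.
            if uErr, ok := err.(*payload.UpdateError); ok && uErr.UpdateEffect == effect {
                continue
            }
            kept = append(kept, err)
        }
        return kept
    }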
4 changes: 0 additions & 4 deletions pkg/cvo/sync_worker.go
@@ -1264,10 +1264,6 @@ func condenseClusterOperators(errs []error) []error {
if updateEffect == payload.UpdateEffectReport {
updateEffect = payload.UpdateEffectNone
}
case payload.UpdateEffectFailAfterInterval:
if updateEffect != payload.UpdateEffectFail {
updateEffect = payload.UpdateEffectFailAfterInterval
}
case payload.UpdateEffectFail:
updateEffect = payload.UpdateEffectFail
}
20 changes: 10 additions & 10 deletions pkg/cvo/sync_worker_test.go
@@ -256,8 +256,8 @@ func Test_condenseClusterOperators(t *testing.T) {
Name: "test-co-B",
Task: task("test-co-B", configv1.GroupVersion.WithKind("ClusterOperator")),
}
coCDegradedFailAfterInterval := &payload.UpdateError{
UpdateEffect: payload.UpdateEffectFailAfterInterval,
coCDegraded := &payload.UpdateError{
UpdateEffect: payload.UpdateEffectReport,
Reason: "ClusterOperatorDegraded",
PluralReason: "ClusterOperatorsDegraded",
Message: "Cluster operator test-co-C is degraded",
@@ -318,10 +318,9 @@ func Test_condenseClusterOperators(t *testing.T) {
input: []error{
coBUpdating,
coAUpdating,
coCDegradedFailAfterInterval,
coCDegraded,
},
expected: []error{
coCDegradedFailAfterInterval, // Degraded sorts before Updating
&payload.UpdateError{
Nested: errors.NewAggregate([]error{coBUpdating, coAUpdating}),
UpdateEffect: payload.UpdateEffectNone,
@@ -332,17 +331,18 @@ func Test_condenseClusterOperators(t *testing.T) {
Name: "test-co-A, test-co-B",
Names: []string{"test-co-A", "test-co-B"},
},
coCDegraded,
},
}, {
name: "there ClusterOperator with the same reason but different effects",
input: []error{
coBDegradedFail,
coADegradedNone,
coCDegradedFailAfterInterval,
coCDegraded,
},
expected: []error{
&payload.UpdateError{
Nested: errors.NewAggregate([]error{coBDegradedFail, coADegradedNone, coCDegradedFailAfterInterval}),
Nested: errors.NewAggregate([]error{coBDegradedFail, coADegradedNone, coCDegraded}),
UpdateEffect: payload.UpdateEffectFail,
Reason: "ClusterOperatorsDegraded",
PluralReason: "ClusterOperatorsDegraded",
@@ -353,15 +353,15 @@ func Test_condenseClusterOperators(t *testing.T) {
},
},
}, {
name: "to ClusterOperator with the same reason but None and FailAfterInterval effects",
name: "to ClusterOperator with the same reason but None and Report effects",
input: []error{
coADegradedNone,
coCDegradedFailAfterInterval,
coCDegraded,
},
expected: []error{
&payload.UpdateError{
Nested: errors.NewAggregate([]error{coADegradedNone, coCDegradedFailAfterInterval}),
UpdateEffect: payload.UpdateEffectFailAfterInterval,
Nested: errors.NewAggregate([]error{coADegradedNone, coCDegraded}),
UpdateEffect: payload.UpdateEffectFail,
Reason: "ClusterOperatorsDegraded",
PluralReason: "ClusterOperatorsDegraded",
Message: "Cluster operators test-co-A, test-co-C are degraded",
4 changes: 0 additions & 4 deletions pkg/payload/task.go
@@ -161,10 +161,6 @@ const (

// UpdateEffectFail defines an error as indicating the update is failing.
UpdateEffectFail UpdateEffectType = "Fail"

// UpdateEffectFailAfterInterval defines an error as one which indicates the update
// is failing if the error continues for a defined interval.
UpdateEffectFailAfterInterval UpdateEffectType = "FailAfterInterval"
)

// UpdateError is a wrapper for errors that occur during a payload sync.
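After this removal, pkg/payload is left with three update effects. A paraphrased summary; only UpdateEffectFail's doc comment appears in the hunk, and the other comments and string values are assumptions that follow the existing naming pattern:

    const (
        // UpdateEffectNone: the error should not, by itself, mark the update as failing.
        UpdateEffectNone UpdateEffectType = "None"
        // UpdateEffectReport: the error is surfaced in status but does not fail the update.
        UpdateEffectReport UpdateEffectType = "Report"
        // UpdateEffectFail defines an error as indicating the update is failing.
        UpdateEffectFail UpdateEffectType = "Fail"
    )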