Skip to content

Commit b01f931

Browse files
Merge pull request #1165 from hongkailiu/OCPBUGS-23514
OCPBUGS-23514: Failing=Unknown upon long CO updating
2 parents 1c90cbb + 8892f42 commit b01f931

File tree

3 files changed

+336
-15
lines changed

3 files changed

+336
-15
lines changed

pkg/cvo/status.go

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,14 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
327327
failingCondition.Reason = failingReason
328328
failingCondition.Message = failingMessage
329329
}
330+
if failure != nil &&
331+
strings.HasPrefix(progressReason, slowCOUpdatePrefix) {
332+
failingCondition.Status = configv1.ConditionUnknown
333+
failingCondition.Reason = "SlowClusterOperator"
334+
failingCondition.Message = progressMessage
335+
}
336+
progressReason = strings.TrimPrefix(progressReason, slowCOUpdatePrefix)
337+
330338
resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
331339

332340
// update progressing
@@ -537,6 +545,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus,
537545
}
538546
}
539547

548+
const slowCOUpdatePrefix = "Slow::"
549+
540550
// convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
541551
// still making internal progress. The general error we try to suppress is an operator or operators still being
542552
// progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines
@@ -555,28 +565,67 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
555565
case payload.UpdateEffectReport:
556566
return uErr.Reason, uErr.Error(), false
557567
case payload.UpdateEffectNone:
558-
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
568+
return convertErrorToProgressingForUpdateEffectNone(uErr, now)
559569
case payload.UpdateEffectFail:
560570
return "", "", false
561571
case payload.UpdateEffectFailAfterInterval:
562-
var exceeded []string
563-
threshold := now.Add(-(40 * time.Minute))
564-
names := uErr.Names
565-
if len(names) == 0 {
566-
names = []string{uErr.Name}
567-
}
568-
for _, name := range names {
569-
if payload.COUpdateStartTimesGet(name).Before(threshold) {
572+
return convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr, now)
573+
}
574+
return "", "", false
575+
}
576+
577+
func convertErrorToProgressingForUpdateEffectNone(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
578+
var exceeded []string
579+
names := uErr.Names
580+
if len(names) == 0 {
581+
names = []string{uErr.Name}
582+
}
583+
var machineConfig bool
584+
for _, name := range names {
585+
m := 30 * time.Minute
586+
// It takes longer to upgrade MCO
587+
if name == "machine-config" {
588+
m = 3 * m
589+
}
590+
t := payload.COUpdateStartTimesGet(name)
591+
if (!t.IsZero()) && t.Before(now.Add(-(m))) {
592+
if name == "machine-config" {
593+
machineConfig = true
594+
} else {
570595
exceeded = append(exceeded, name)
571596
}
572597
}
573-
if len(exceeded) > 0 {
574-
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
575-
} else {
576-
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
598+
}
599+
// returns true in those slow cases because it is still only a suspicion
600+
if len(exceeded) > 0 && !machineConfig {
601+
return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
602+
}
603+
if len(exceeded) > 0 && machineConfig {
604+
return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
605+
}
606+
if len(exceeded) == 0 && machineConfig {
607+
return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true
608+
}
609+
return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
610+
}
611+
612+
func convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
613+
var exceeded []string
614+
threshold := now.Add(-(40 * time.Minute))
615+
names := uErr.Names
616+
if len(names) == 0 {
617+
names = []string{uErr.Name}
618+
}
619+
for _, name := range names {
620+
if payload.COUpdateStartTimesGet(name).Before(threshold) {
621+
exceeded = append(exceeded, name)
577622
}
578623
}
579-
return "", "", false
624+
if len(exceeded) > 0 {
625+
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
626+
} else {
627+
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
628+
}
580629
}
581630

582631
// syncFailingStatus handles generic errors in the cluster version. It tries to preserve

0 commit comments

Comments
 (0)