
Commit 5d6451d

increase waitForFallbackDegradedConditionTimeout

1 parent fef044d commit 5d6451d

File tree

1 file changed (+35 −15 lines changed)


test/e2e-sno-disruptive/sno_disruptive_test.go

Lines changed: 35 additions & 15 deletions
@@ -23,62 +23,82 @@ import (
 func TestFallback(tt *testing.T) {
 	t := commontesthelpers.NewE(tt)
 	cs := getClients(t)
+	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
+	defer cancel()
 
 	t.Log("Starting the fallback test")
 	clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)
 
 	// before starting a new test make sure the current state of the cluster is good
-	ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	ensureClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
 
 	// cause a disruption
 	cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
 	cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
 	setUnsupportedConfig(t, cs, cfg)
 
 	// validate if the fallback condition is reported and the cluster is stable
-	waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout)
+	waitForFallbackDegradedCondition(ctx, t, cs, waitForFallbackDegradedConditionTimeout)
 	nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
 	assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)
 
-	// clean up
+	// clean up and some extra time is needed to wait for the KAS operator to be ready
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, 5*clusterMustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // ensureClusterInGoodState makes sure the cluster is not progressing for mustBeReadyFor period
 // in addition in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the tests starts
-func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
+func ensureClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, waitPollTimeout, mustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // waitForClusterInGoodState checks if the cluster is not progressing
-func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
+func waitForClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
 	t.Helper()
 
 	startTs := time.Now()
-	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
+	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 20s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
+
+	return wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
+		// Check if the context is already done to prevent unnecessary API calls
+		select {
+		case <-ctx.Done():
+			t.Logf("Context canceled before cluster reached good state: %v", ctx.Err())
+			return false, ctx.Err() // Return the context error to indicate cancellation
+		default:
+		}
 
-	return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
-		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
+		// Use ctx instead of context.TODO()
+		ckaso, err := cs.Operator.Get(ctx, "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Log(err)
 			return false, nil /*retry*/
 		}
 
+		// Check if any node is still progressing
 		for _, ns := range ckaso.Status.NodeStatuses {
 			if ckaso.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0 {
-				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
+				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v",
+					ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
 				return false, nil /*retry*/
 			}
 		}
 
-		if time.Since(startTs) > mustBeReadyFor {
+		// Verify operator conditions
+		ckasoAvailable := v1helpers.IsOperatorConditionTrue(ckaso.Status.Conditions, "StaticPodsAvailable")
+		ckasoNotProgressing := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeInstallerProgressing")
+		ckasoNotDegraded := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeControllerDegraded")
+
+		// If cluster has been stable for the required time, return success
+		if time.Since(startTs) > mustBeReadyFor && ckasoAvailable && ckasoNotProgressing && ckasoNotDegraded {
			t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
 			return true, nil /*done*/
 		}
+
 		return false, nil /*wait a bit more*/
 	})
 }
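
The heart of this hunk is the switch from wait.Poll to the context-aware wait.PollUntilContextTimeout from k8s.io/apimachinery/pkg/util/wait, which newer apimachinery versions favor. Below is a minimal, self-contained sketch of that pattern; the interval, timeout, and condition are illustrative stand-ins, not the test's real values.

// Standalone sketch of the wait.PollUntilContextTimeout pattern used above.
// Assumes the k8s.io/apimachinery module is on the module path.
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// A parent context bounds the whole run, mirroring the test's
	// context.WithTimeout(context.Background(), 20*time.Minute).
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	start := time.Now()
	// Poll every 2s for up to 30s; the "true" argument runs the condition
	// immediately instead of waiting one interval first.
	err := wait.PollUntilContextTimeout(ctx, 2*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
		// (false, nil) retries, (true, nil) succeeds, a non-nil error aborts.
		return time.Since(start) > 5*time.Second, nil
	})
	fmt.Println("poll finished:", err)
}
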
@@ -108,11 +128,11 @@ func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}
 }
 
 // waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true
-func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
+func waitForFallbackDegradedCondition(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
 	t.Helper()
 
 	t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
-	err := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
+	err := wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
 		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Logf("unable to get kube-apiserver-operator resource: %v", err)
@@ -236,5 +256,5 @@ func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Durati
 	   including the time the server needs to become ready and be noticed by a Load Balancer
 	   longer duration allows as to collect logs and the must-gather
 	*/
-	return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 10 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
+	return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 18 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
 }
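
For reference, the three operator-condition checks added in the first hunk come from library-go's v1helpers package. A hedged sketch of how they evaluate, with a fabricated condition list standing in for the operator's real status:

// Standalone sketch of the v1helpers condition checks used in
// waitForClusterInGoodState. The condition values below are made up
// for illustration; the test reads them from ckaso.Status.Conditions.
package main

import (
	"fmt"

	operatorv1 "github.com/openshift/api/operator/v1"
	"github.com/openshift/library-go/pkg/operator/v1helpers"
)

func main() {
	conditions := []operatorv1.OperatorCondition{
		{Type: "StaticPodsAvailable", Status: operatorv1.ConditionTrue},
		{Type: "NodeInstallerProgressing", Status: operatorv1.ConditionFalse},
		{Type: "NodeControllerDegraded", Status: operatorv1.ConditionFalse},
	}

	// The test now treats the cluster as "good" only when all three hold,
	// on top of the per-node revision checks shown in the first hunk.
	good := v1helpers.IsOperatorConditionTrue(conditions, "StaticPodsAvailable") &&
		v1helpers.IsOperatorConditionFalse(conditions, "NodeInstallerProgressing") &&
		v1helpers.IsOperatorConditionFalse(conditions, "NodeControllerDegraded")
	fmt.Println("cluster in good state:", good)
}
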
