
Commit 5d6451d

increase waitForFallbackDegradedConditionTimeout

1 parent fef044d commit 5d6451d

File tree

1 file changed (+35 −15 lines changed)


test/e2e-sno-disruptive/sno_disruptive_test.go

Lines changed: 35 additions & 15 deletions
@@ -23,62 +23,82 @@ import (
 func TestFallback(tt *testing.T) {
 	t := commontesthelpers.NewE(tt)
 	cs := getClients(t)
+	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
+	defer cancel()
 
 	t.Log("Starting the fallback test")
 	clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)
 
 	// before starting a new test make sure the current state of the cluster is good
-	ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	ensureClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
 
 	// cause a disruption
 	cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
 	cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
 	setUnsupportedConfig(t, cs, cfg)
 
 	// validate if the fallback condition is reported and the cluster is stable
-	waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout)
+	waitForFallbackDegradedCondition(ctx, t, cs, waitForFallbackDegradedConditionTimeout)
 	nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
 	assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)
 
-	// clean up
+	// clean up and some extra time is needed to wait for the KAS operator to be ready
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, 5*clusterMustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // ensureClusterInGoodState makes sure the cluster is not progressing for mustBeReadyFor period
 // in addition in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the tests starts
-func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
+func ensureClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, waitPollTimeout, mustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // waitForClusterInGoodState checks if the cluster is not progressing
-func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
+func waitForClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
 	t.Helper()
 
 	startTs := time.Now()
-	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
+	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 20s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
+
+	return wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
+		// Check if the context is already done to prevent unnecessary API calls
+		select {
+		case <-ctx.Done():
+			t.Logf("Context canceled before cluster reached good state: %v", ctx.Err())
+			return false, ctx.Err() // Return the context error to indicate cancellation
+		default:
+		}
 
-	return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
-		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
+		// Use ctx instead of context.TODO()
+		ckaso, err := cs.Operator.Get(ctx, "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Log(err)
 			return false, nil /*retry*/
 		}
 
+		// Check if any node is still progressing
 		for _, ns := range ckaso.Status.NodeStatuses {
 			if ckaso.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0 {
-				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
+				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v",
+					ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
 				return false, nil /*retry*/
 			}
 		}
 
-		if time.Since(startTs) > mustBeReadyFor {
+		// Verify operator conditions
+		ckasoAvailable := v1helpers.IsOperatorConditionTrue(ckaso.Status.Conditions, "StaticPodsAvailable")
+		ckasoNotProgressing := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeInstallerProgressing")
+		ckasoNotDegraded := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeControllerDegraded")
+
+		// If cluster has been stable for the required time, return success
+		if time.Since(startTs) > mustBeReadyFor && ckasoAvailable && ckasoNotProgressing && ckasoNotDegraded {
			t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
 			return true, nil /*done*/
 		}
+
 		return false, nil /*wait a bit more*/
 	})
 }
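
The heart of this hunk is the switch from wait.Poll to the context-aware wait.PollUntilContextTimeout from k8s.io/apimachinery/pkg/util/wait, which newer apimachinery versions favor. Below is a minimal, self-contained sketch of that pattern; the interval, timeout, and condition are illustrative stand-ins, not the test's real values.

// Standalone sketch of the wait.PollUntilContextTimeout pattern used above.
// Assumes the k8s.io/apimachinery module is on the module path.
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// A parent context bounds the whole run, mirroring the test's
	// context.WithTimeout(context.Background(), 20*time.Minute).
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	start := time.Now()
	// Poll every 2s for up to 30s; the "true" argument runs the condition
	// immediately instead of waiting one interval first.
	err := wait.PollUntilContextTimeout(ctx, 2*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
		// (false, nil) retries, (true, nil) succeeds, a non-nil error aborts.
		return time.Since(start) > 5*time.Second, nil
	})
	fmt.Println("poll finished:", err)
}
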
@@ -108,11 +128,11 @@ func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}
 }
 
 // waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true
-func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
+func waitForFallbackDegradedCondition(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
 	t.Helper()
 
 	t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
-	err := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
+	err := wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
 		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Logf("unable to get kube-apiserver-operator resource: %v", err)
@@ -236,5 +256,5 @@ func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Durati
 	   including the time the server needs to become ready and be noticed by a Load Balancer
 	   longer duration allows as to collect logs and the must-gather
 	*/
-	return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 10 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
+	return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 18 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
 }
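
For reference, the three operator-condition checks added in the first hunk come from library-go's v1helpers package. A hedged sketch of how they evaluate, with a fabricated condition list standing in for the operator's real status:

// Standalone sketch of the v1helpers condition checks used in
// waitForClusterInGoodState. The condition values below are made up
// for illustration; the test reads them from ckaso.Status.Conditions.
package main

import (
	"fmt"

	operatorv1 "github.com/openshift/api/operator/v1"
	"github.com/openshift/library-go/pkg/operator/v1helpers"
)

func main() {
	conditions := []operatorv1.OperatorCondition{
		{Type: "StaticPodsAvailable", Status: operatorv1.ConditionTrue},
		{Type: "NodeInstallerProgressing", Status: operatorv1.ConditionFalse},
		{Type: "NodeControllerDegraded", Status: operatorv1.ConditionFalse},
	}

	// The test now treats the cluster as "good" only when all three hold,
	// on top of the per-node revision checks shown in the first hunk.
	good := v1helpers.IsOperatorConditionTrue(conditions, "StaticPodsAvailable") &&
		v1helpers.IsOperatorConditionFalse(conditions, "NodeInstallerProgressing") &&
		v1helpers.IsOperatorConditionFalse(conditions, "NodeControllerDegraded")
	fmt.Println("cluster in good state:", good)
}
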
