Commit 308bfef

increase waitForFallbackDegradedConditionTimeout
1 parent fef044d commit 308bfef

File tree

1 file changed: +43 additions, -27 deletions


test/e2e-sno-disruptive/sno_disruptive_test.go

Lines changed: 43 additions & 27 deletions
@@ -23,62 +23,72 @@ import (
 func TestFallback(tt *testing.T) {
 	t := commontesthelpers.NewE(tt)
 	cs := getClients(t)
+	ctx := context.TODO()
 
 	t.Log("Starting the fallback test")
-	clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)
+	clusterStateWaitPollTimeout, clusterMustBeReadyForBeforeTest, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)
 
 	// before starting a new test make sure the current state of the cluster is good
-	ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	ensureClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyForBeforeTest)
 
 	// cause a disruption
 	cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
 	cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
 	setUnsupportedConfig(t, cs, cfg)
 
 	// validate if the fallback condition is reported and the cluster is stable
-	waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout)
+	waitForFallbackDegradedCondition(ctx, t, cs, waitForFallbackDegradedConditionTimeout)
 	nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
 	assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)
 
-	// clean up
+	// clean up and some extra time is needed to wait for the KAS operator to be ready
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // ensureClusterInGoodState makes sure the cluster is not progressing for mustBeReadyFor period
 // in addition in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the tests starts
-func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
+func ensureClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, waitPollTimeout, mustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // waitForClusterInGoodState checks if the cluster is not progressing
-func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
+func waitForClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
 	t.Helper()
 
 	startTs := time.Now()
-	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
+	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 20s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
 
-	return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
-		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
+	return wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
+		ckaso, err := cs.Operator.Get(ctx, "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Log(err)
 			return false, nil /*retry*/
 		}
 
+		// Check if any node is still progressing
 		for _, ns := range ckaso.Status.NodeStatuses {
 			if ckaso.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0 {
-				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
+				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v",
+					ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
 				return false, nil /*retry*/
 			}
 		}
 
-		if time.Since(startTs) > mustBeReadyFor {
+		// Verify operator conditions
+		ckasoAvailable := v1helpers.IsOperatorConditionTrue(ckaso.Status.Conditions, "StaticPodsAvailable")
+		ckasoNotProgressing := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeInstallerProgressing")
+		ckasoNotDegraded := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeControllerDegraded")
+
+		// If cluster has been stable for the required time, return success
+		if time.Since(startTs) > mustBeReadyFor && ckasoAvailable && ckasoNotProgressing && ckasoNotDegraded {
 			t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
 			return true, nil /*done*/
 		}
+
 		return false, nil /*wait a bit more*/
 	})
 }
@@ -108,11 +118,11 @@ func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}
 }
 
 // waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true
-func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
+func waitForFallbackDegradedCondition(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
 	t.Helper()
 
 	t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
-	err := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
+	err := wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
 		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Logf("unable to get kube-apiserver-operator resource: %v", err)
@@ -220,21 +230,27 @@ func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) m
 // fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform
 // TODO: add timeouts for AWS and GCP
 // TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout
-func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration) {
+func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration, time.Duration) {
 	/*
-	 default timeouts that apply when the test is run on an SNO cluster
+		default timeouts that apply when the test is run on an SNO cluster
+
+		clusterStateWaitPollInterval: is the max time after the cluster is considered not ready
+		it should match waitForFallbackDegradedConditionTimeout
+		because we don't know when the previous test finished
 
-	 clusterStateWaitPollInterval: is the max time after the cluster is considered not ready
-	 it should match waitForFallbackDegradedConditionTimeout
-	 because we don't know when the previous test finished
+		clusterMustBeReadyForBeforeTest: the time that make sure the current state of the cluster is good
+		before starting a new test
 
-	 clusterMustBeReadyFor: the time the cluster must stay stable
+		clusterMustBeReadyFor: the time the cluster must stay stable
 
-	 waitForFallbackDegradedConditionTimeout: set to 10 min, it should be much lower
-	 the static pod monitor needs 5 min to fallback to the previous revision
-	 but we don't know yet how much time it takes to start a new api server
-	 including the time the server needs to become ready and be noticed by a Load Balancer
-	 longer duration allows as to collect logs and the must-gather
+		waitForFallbackDegradedConditionTimeout: set to 10 min, it should be much lower
+		the static pod monitor needs 5 min to fallback to the previous revision
+		but we don't know yet how much time it takes to start a new api server
+		including the time the server needs to become ready and be noticed by a Load Balancer
+		longer duration allows as to collect logs and the must-gather
 	*/
-	return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 10 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
+	return 10 * time.Minute, // clusterStateWaitPollInterval
+		1 * time.Minute, // clusterMustBeReadyForBeforeTest
+		5 * time.Minute, // clusterMustBeReadyFor
+		18 * time.Minute // waitForFallbackDegradedConditionTimeout
 }
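
Besides increasing the timeout, the commit moves the test from the deprecated wait.Poll to the context-aware wait.PollUntilContextTimeout from k8s.io/apimachinery/pkg/util/wait, and adds operator-condition checks via v1helpers.IsOperatorConditionTrue / IsOperatorConditionFalse from openshift/library-go before declaring the cluster stable. The sketch below is a minimal, standalone illustration of that polling pattern; it is not part of the commit, and the 20s/18m values and the checkSomething helper are made up for the example.

// Minimal sketch of context-aware polling with wait.PollUntilContextTimeout.
// The interval, timeout, and checkSomething are illustrative only.
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// checkSomething stands in for the operator and node-status checks the real test performs.
func checkSomething(ctx context.Context) bool { return true }

func main() {
	ctx := context.TODO()

	// immediate=true runs the condition once right away, then every 20s until the 18m
	// timeout elapses or ctx is cancelled, whichever comes first.
	err := wait.PollUntilContextTimeout(ctx, 20*time.Second, 18*time.Minute, true, func(ctx context.Context) (bool, error) {
		if !checkSomething(ctx) {
			return false, nil // condition not met yet, retry on the next tick
		}
		return true, nil // done
	})
	if err != nil {
		fmt.Printf("condition was not met before the timeout: %v\n", err)
	}
}

In the actual test the condition body fetches the kube-apiserver operator resource and only returns done once no node is progressing and the StaticPodsAvailable, NodeInstallerProgressing, and NodeControllerDegraded conditions have the expected values for the whole mustBeReadyFor window.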
