@@ -23,62 +23,72 @@ import (
 func TestFallback(tt *testing.T) {
 	t := commontesthelpers.NewE(tt)
 	cs := getClients(t)
+	ctx := context.TODO()
 
 	t.Log("Starting the fallback test")
-	clusterStateWaitPollTimeout, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)
+	clusterStateWaitPollTimeout, clusterMustBeReadyForBeforeTest, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)
 
 	// before starting a new test make sure the current state of the cluster is good
-	ensureClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	ensureClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyForBeforeTest)
 
 	// cause a disruption
 	cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
 	cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
 	setUnsupportedConfig(t, cs, cfg)
 
 	// validate if the fallback condition is reported and the cluster is stable
-	waitForFallbackDegradedCondition(t, cs, waitForFallbackDegradedConditionTimeout)
+	waitForFallbackDegradedCondition(ctx, t, cs, waitForFallbackDegradedConditionTimeout)
 	nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
 	assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)
 
-	// clean up
+	// clean up; some extra time is needed to wait for the KAS operator to be ready
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // ensureClusterInGoodState makes sure the cluster is not progressing for the mustBeReadyFor period
 // in addition, in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the test starts
-func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
+func ensureClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
 	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
-	err := waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor)
+	err := waitForClusterInGoodState(ctx, t, cs, waitPollTimeout, mustBeReadyFor)
 	require.NoError(t, err)
 }
 
 // waitForClusterInGoodState checks if the cluster is not progressing
-func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
+func waitForClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
 	t.Helper()
 
 	startTs := time.Now()
-	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
+	t.Logf("Waiting %s for the cluster to be in a good condition, interval = 20s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
 
-	return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
-		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
+	return wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
+		ckaso, err := cs.Operator.Get(ctx, "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Log(err)
 			return false, nil /*retry*/
 		}
 
+		// Check if any node is still progressing
 		for _, ns := range ckaso.Status.NodeStatuses {
 			if ckaso.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0 {
-				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
+				t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v",
+					ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
 				return false, nil /*retry*/
 			}
 		}
 
-		if time.Since(startTs) > mustBeReadyFor {
+		// Verify operator conditions
+		ckasoAvailable := v1helpers.IsOperatorConditionTrue(ckaso.Status.Conditions, "StaticPodsAvailable")
+		ckasoNotProgressing := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeInstallerProgressing")
+		ckasoNotDegraded := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeControllerDegraded")
+
+		// If the cluster has been stable for the required time, return success
+		if time.Since(startTs) > mustBeReadyFor && ckasoAvailable && ckasoNotProgressing && ckasoNotDegraded {
 			t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
 			return true, nil /*done*/
 		}
+
 		return false, nil /*wait a bit more*/
 	})
 }
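
The hunk above swaps the deprecated `wait.Poll` for the context-aware `wait.PollUntilContextTimeout` from `k8s.io/apimachinery/pkg/util/wait`. A minimal sketch of that polling pattern in isolation; the `checkReady` helper and the 20s/10m values are illustrative stand-ins, not taken from this PR:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	ctx := context.TODO()

	// Poll immediately (immediate=true), then every 20s, until the condition
	// returns true, the condition returns an error, or the 10m timeout expires.
	err := wait.PollUntilContextTimeout(ctx, 20*time.Second, 10*time.Minute, true,
		func(ctx context.Context) (bool, error) {
			// (false, nil) retries, (true, nil) stops successfully,
			// and a non-nil error aborts the poll.
			return checkReady(ctx), nil
		})
	fmt.Println("poll finished:", err)
}

// checkReady is a placeholder for a real readiness probe.
func checkReady(_ context.Context) bool {
	return true
}
```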
@@ -108,11 +118,11 @@ func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}
 }
 
 // waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true
-func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
+func waitForFallbackDegradedCondition(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
 	t.Helper()
 
 	t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
-	err := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
+	err := wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
 		ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
 		if err != nil {
 			t.Logf("unable to get kube-apiserver-operator resource: %v", err)
@@ -220,21 +230,27 @@ func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) m
 // fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform
 // TODO: add timeouts for AWS and GCP
 // TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout
-func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration) {
+func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration, time.Duration) {
 	/*
-		default timeouts that apply when the test is run on an SNO cluster
+		default timeouts that apply when the test is run on an SNO cluster
+
+		clusterStateWaitPollInterval: the max time after which the cluster is considered not ready,
+		it should match waitForFallbackDegradedConditionTimeout
+		because we don't know when the previous test finished
 
-		clusterStateWaitPollInterval: is the max time after the cluster is considered not ready
-		it should match waitForFallbackDegradedConditionTimeout
-		because we don't know when the previous test finished
+		clusterMustBeReadyForBeforeTest: the time used to make sure the current state of the cluster is good
+		before starting a new test
 
-		clusterMustBeReadyFor: the time the cluster must stay stable
+		clusterMustBeReadyFor: the time the cluster must stay stable
 
-		waitForFallbackDegradedConditionTimeout: set to 10 min, it should be much lower
-		the static pod monitor needs 5 min to fallback to the previous revision
-		but we don't know yet how much time it takes to start a new api server
-		including the time the server needs to become ready and be noticed by a Load Balancer
-		longer duration allows as to collect logs and the must-gather
+		waitForFallbackDegradedConditionTimeout: set to 18 min, it should be much lower
+		the static pod monitor needs 5 min to fall back to the previous revision
+		but we don't know yet how much time it takes to start a new api server
+		including the time the server needs to become ready and be noticed by a Load Balancer
+		a longer duration allows us to collect logs and the must-gather
 	*/
-	return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 10 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
+	return 10 * time.Minute, // clusterStateWaitPollInterval
+		1 * time.Minute, // clusterMustBeReadyForBeforeTest
+		5 * time.Minute, // clusterMustBeReadyFor
+		18 * time.Minute // waitForFallbackDegradedConditionTimeout
 }
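
The helpers now take a context, but TestFallback still hands them `context.TODO()`. If one wanted to bound all polling by the Go test deadline instead, a small sketch could look like the following; the `testContext` helper is hypothetical and not part of this PR:

```go
package e2e

import (
	"context"
	"testing"
	"time"
)

// testContext derives a context from the `go test` deadline so that polling
// helpers stop before the test binary is killed; it falls back to a plain
// cancellable context when no deadline is set (e.g. -timeout=0).
func testContext(t *testing.T) (context.Context, context.CancelFunc) {
	if deadline, ok := t.Deadline(); ok {
		// keep a small margin so cleanup and log collection can still run
		return context.WithDeadline(context.Background(), deadline.Add(-30*time.Second))
	}
	return context.WithCancel(context.Background())
}
```

With such a helper, `ctx := context.TODO()` in TestFallback could become `ctx, cancel := testContext(tt)` followed by `defer cancel()`.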