Skip to content

Commit

Permalink
increase waitForFallbackDegradedConditionTimeout
Browse files Browse the repository at this point in the history
  • Loading branch information
wangke19 committed Feb 6, 2025
1 parent fef044d commit 707728f
Showing 1 changed file with 33 additions and 17 deletions.
50 changes: 33 additions & 17 deletions test/e2e-sno-disruptive/sno_disruptive_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,62 +23,72 @@ import (
// TestFallback verifies the static-pod fallback mechanism: it deliberately
// rolls out a broken kube-apiserver revision (a non-existing flag), waits for
// the StaticPodFallbackRevisionDegraded condition to be reported, checks that
// the failed revision is annotated on the node's KAS pod, and finally restores
// the default config and waits for the cluster to become healthy again.
func TestFallback(tt *testing.T) {
	t := commontesthelpers.NewE(tt)
	cs := getClients(t)
	ctx := context.TODO()

	t.Log("Starting the fallback test")
	clusterStateWaitPollTimeout, clusterMustBeReadyForBeforeTest, clusterMustBeReadyFor, waitForFallbackDegradedConditionTimeout := fallbackTimeoutsForCurrentPlatform(t, cs)

	// before starting a new test make sure the current state of the cluster is good
	ensureClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyForBeforeTest)

	// cause a disruption by rolling out a config the kube-apiserver cannot start with
	cfg := getDefaultUnsupportedConfigForCurrentPlatform(t, cs)
	cfg["apiServerArguments"] = map[string][]string{"non-existing-flag": {"true"}}
	setUnsupportedConfig(t, cs, cfg)

	// validate if the fallback condition is reported and the cluster is stable
	waitForFallbackDegradedCondition(ctx, t, cs, waitForFallbackDegradedConditionTimeout)
	nodeName, failedRevision := assertFallbackOnNodeStatus(t, cs)
	assertKasPodAnnotatedOnNode(t, cs, failedRevision, nodeName)

	// clean up and some extra time is needed to wait for the KAS operator to be ready
	setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
	err := waitForClusterInGoodState(ctx, t, cs, clusterStateWaitPollTimeout, clusterMustBeReadyFor)
	require.NoError(t, err)
}

// ensureClusterInGoodState makes sure the cluster is not progressing for mustBeReadyFor period
// in addition in an HA env it applies getDefaultUnsupportedConfigForCurrentPlatform so that the feature is enabled before the tests starts
func ensureClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
func ensureClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) {
setUnsupportedConfig(t, cs, getDefaultUnsupportedConfigForCurrentPlatform(t, cs))
err := waitForClusterInGoodState(t, cs, waitPollTimeout, mustBeReadyFor)
err := waitForClusterInGoodState(ctx, t, cs, waitPollTimeout, mustBeReadyFor)
require.NoError(t, err)
}

// waitForClusterInGoodState checks if the cluster is not progressing
func waitForClusterInGoodState(t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
func waitForClusterInGoodState(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout, mustBeReadyFor time.Duration) error {
t.Helper()

startTs := time.Now()
t.Logf("Waiting %s for the cluster to be in a good condition, interval = 10s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)
t.Logf("Waiting %s for the cluster to be in a good condition, interval = 20s, timeout %v", mustBeReadyFor.String(), waitPollTimeout)

return wait.Poll(10*time.Second, waitPollTimeout, func() (bool, error) {
ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
return wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
ckaso, err := cs.Operator.Get(ctx, "cluster", metav1.GetOptions{})
if err != nil {
t.Log(err)
return false, nil /*retry*/
}

// Check if any node is still progressing
for _, ns := range ckaso.Status.NodeStatuses {
if ckaso.Status.LatestAvailableRevision != ns.CurrentRevision || ns.TargetRevision > 0 {
t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v", ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
t.Logf("Node %s is progressing, latestAvailableRevision: %v, currentRevision: %v, targetRevision: %v",
ns.NodeName, ckaso.Status.LatestAvailableRevision, ns.CurrentRevision, ns.TargetRevision)
return false, nil /*retry*/
}
}

if time.Since(startTs) > mustBeReadyFor {
// Verify operator conditions
ckasoAvailable := v1helpers.IsOperatorConditionTrue(ckaso.Status.Conditions, "StaticPodsAvailable")
ckasoNotProgressing := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeInstallerProgressing")
ckasoNotDegraded := v1helpers.IsOperatorConditionFalse(ckaso.Status.Conditions, "NodeControllerDegraded")

// If cluster has been stable for the required time, return success
if time.Since(startTs) > mustBeReadyFor && ckasoAvailable && ckasoNotProgressing && ckasoNotDegraded {
t.Logf("The cluster has been in good condition for %s", mustBeReadyFor.String())
return true, nil /*done*/
}

return false, nil /*wait a bit more*/
})
}
Expand Down Expand Up @@ -108,11 +118,11 @@ func setUnsupportedConfig(t testing.TB, cs clientSet, cfg map[string]interface{}
}

// waitForFallbackDegradedCondition waits until StaticPodFallbackRevisionDegraded condition is set to true
func waitForFallbackDegradedCondition(t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
func waitForFallbackDegradedCondition(ctx context.Context, t testing.TB, cs clientSet, waitPollTimeout time.Duration) {
t.Helper()

t.Logf("Waiting for StaticPodFallbackRevisionDegraded condition, interval = 20s, timeout = %v", waitPollTimeout)
err := wait.Poll(20*time.Second, waitPollTimeout, func() (bool, error) {
err := wait.PollUntilContextTimeout(ctx, 20*time.Second, waitPollTimeout, true, func(cxt context.Context) (bool, error) {
ckaso, err := cs.Operator.Get(context.TODO(), "cluster", metav1.GetOptions{})
if err != nil {
t.Logf("unable to get kube-apiserver-operator resource: %v", err)
Expand Down Expand Up @@ -220,14 +230,17 @@ func getDefaultUnsupportedConfigForCurrentPlatform(t testing.TB, cs clientSet) m
// fallbackTimeoutsForCurrentPlatform provides various timeouts that are tailored for the current platform
// TODO: add timeouts for AWS and GCP
// TODO: we should be able to return only a single per-platform specific timeout and derive the rest e.g. oneNodeRolloutTimeout
func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration) {
func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Duration, time.Duration, time.Duration, time.Duration) {
/*
default timeouts that apply when the test is run on an SNO cluster
clusterStateWaitPollInterval: the max time to wait before the cluster is considered not ready,
it should match waitForFallbackDegradedConditionTimeout
because we don't know when the previous test finished
clusterMustBeReadyForBeforeTest: the time used to make sure the current state of the cluster is good
before starting a new test
clusterMustBeReadyFor: the time the cluster must stay stable
waitForFallbackDegradedConditionTimeout: set to 10 min, it should be much lower
Expand All @@ -236,5 +249,8 @@ func fallbackTimeoutsForCurrentPlatform(t testing.TB, cs clientSet) (time.Durati
including the time the server needs to become ready and be noticed by a Load Balancer
a longer duration allows us to collect logs and the must-gather
*/
return 10 * time.Minute /*clusterStateWaitPollInterval*/, 1 * time.Minute /*clusterMustBeReadyFor*/, 10 * time.Minute /*waitForFallbackDegradedConditionTimeout*/
return 10 * time.Minute, // clusterStateWaitPollInterval
1 * time.Minute, // clusterMustBeReadyForBeforeTest
5 * time.Minute, // clusterMustBeReadyFor
18 * time.Minute // waitForFallbackDegradedConditionTimeout
}

0 comments on commit 707728f

Please sign in to comment.