Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-24.2: roachtest: mark polled VM preemptions as non reportable #141137

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,9 @@ func (r *testRunner) runTest(
// Note that this error message is referred for test selection in
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
// Reset the failures as a timeout may have suppressed failures, but we
// want to propagate the preemption error and avoid creating an issue.
t.resetFailures()
t.Error(vmPreemptionError(preemptedVMNames))
}
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)
Expand Down Expand Up @@ -2027,11 +2030,15 @@ func monitorForPreemptedVMs(ctx context.Context, t test.Test, c cluster.Cluster,
continue
}

// If we find any preemptions, fail the test. Note that we will recheck for
// preemptions in post failure processing, which will correctly assign this
// failure as an infra flake.
// If we find any preemptions, fail the test. Note that while we will recheck for
// preemptions in post failure processing, we need to mark the test as a preemption
// failure here in case the recheck says there were no preemptions.
if len(preemptedVMs) != 0 {
t.Errorf("monitorForPreemptedVMs: Preempted VMs detected: %s", preemptedVMs)
var vmNames []string
for _, preemptedVM := range preemptedVMs {
vmNames = append(vmNames, preemptedVM.Name)
}
t.Errorf("monitorForPreemptedVMs detected VM Preemptions: %s", vmPreemptionError(getVMNames(vmNames)))
}
}
}
Expand Down
63 changes: 63 additions & 0 deletions pkg/cmd/roachtest/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -654,4 +654,67 @@ func TestVMPreemptionPolling(t *testing.T) {
// be treated as a flake instead of a failed test.
require.NoError(t, err)
})

// Test that if VM preemption polling finds a preempted VM but the post test failure
// check doesn't, the test is still marked as a flake.
t.Run("post test check doesn't catch preemption", func(t *testing.T) {
setPollPreemptionInterval(10 * time.Millisecond)
testPreemptedCh := make(chan struct{})
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
preemptedVMs := []vm.PreemptedVM{{
Name: "test_node",
PreemptedAt: time.Now(),
}}
close(testPreemptedCh)
return preemptedVMs, nil
}

mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
defer func() {
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
return nil, nil
}
}()
// Make sure the preemption polling is called and the test context is cancelled
// before unblocking. Under stress, the test may time out before the preemption
// check is called otherwise.
<-testPreemptedCh
<-ctx.Done()
}

err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)

require.NoError(t, err)
})

// Test that if the test hangs until timeout, a VM preemption will still be caught.
t.Run("test hangs and still catches preemption", func(t *testing.T) {
// We don't want the polling to cancel the test early.
setPollPreemptionInterval(10 * time.Minute)
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
preemptedVMs := []vm.PreemptedVM{{
Name: "test_node",
PreemptedAt: time.Now(),
}}
return preemptedVMs, nil
}

mockTest.Timeout = 10 * time.Millisecond
// We expect the following to occur:
// 1. The test blocks on the context, which is only cancelled when the test runner
// returns after test completion. This effectively blocks the test forever.
// 2. The test times out and the test runner marks it as failed.
// 3. Normally, this would result in a failed test and runner.Run returning an error.
// However, because we injected a preemption, the test runner marks it as a flake
// instead and returns no errors.
mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
<-ctx.Done()
}

err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)

require.NoError(t, err)
})
}