fix: deletion of vmss when provisioning fails during e2es #3945

Merged
merged 4 commits on Jan 3, 2024
16 changes: 15 additions & 1 deletion .pipelines/e2e.yaml
@@ -38,13 +38,27 @@ jobs:
       export BUILD_ID=$(Build.BuildId)

       cd e2e
-      go test -timeout 45m -v -run Test_All ./
+      go test -timeout 90m -v -run Test_All ./
     displayName: Run AgentBaker E2E
     env:
       VHD_BUILD_ID: $(VHD_BUILD_ID)
       ADO_PAT: $(ADO_PAT)
   - publish: $(System.DefaultWorkingDirectory)/e2e/scenario-logs
     artifact: scenario-logs
     condition: always()
+  - bash: |
+      set -x
+
+      vmssResourceIds=""
+      for vmssModel in e2e/scenario-logs/*/vmssId.txt; do
+        resourceId=$(cat ${vmssModel})
+        vmssResourceIds="${vmssResourceIds} ${resourceId}"
+      done
+
+      if [ -n "${vmssResourceIds// }" ]; then
+        az resource delete --ids ${vmssResourceIds}
+      fi
+    displayName: ensure vmss deletion
+    condition: always()
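The new step sweeps up any scale set whose resource ID the suite recorded into a scenario's log bundle. For reference, here is a minimal Go sketch of the same sweep run locally — a hypothetical standalone helper, assuming the `az` CLI is authenticated and on `PATH` and the `scenario-logs` layout shown above:

```go
// cleanup_sketch.go: best-effort deletion of VMSS resource IDs recorded
// by the e2e suite under scenario-logs/<scenario>/vmssId.txt.
package main

import (
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

func main() {
	// Matches the glob used by the pipeline step above.
	files, err := filepath.Glob("e2e/scenario-logs/*/vmssId.txt")
	if err != nil {
		log.Fatal(err)
	}

	var ids []string
	for _, f := range files {
		raw, err := os.ReadFile(f)
		if err != nil {
			log.Printf("skipping %s: %v", f, err)
			continue
		}
		if id := strings.TrimSpace(string(raw)); id != "" {
			ids = append(ids, id)
		}
	}

	if len(ids) == 0 {
		return // nothing to delete
	}

	// az resource delete accepts multiple resource IDs after --ids.
	args := append([]string{"resource", "delete", "--ids"}, ids...)
	cmd := exec.Command("az", args...)
	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
	if err := cmd.Run(); err != nil {
		log.Fatalf("vmss cleanup failed: %v", err)
	}
}
```

Like the pipeline step, this is best-effort: a missing or empty `vmssId.txt` is skipped rather than failing the run.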


4 changes: 2 additions & 2 deletions e2e/README.md
@@ -30,12 +30,12 @@ Furthermore, `SCENARIOS_TO_EXCLUDE` may also optionally be set to specify the se

`KEEP_VMSS` can also be optionally specified to have the test suite retain the bootstrapped VM(s) for further debugging. When this option is specified, the private SSH key used to connect to each VM will be included within each scenario's log bundle respectively.

-**Note that when using `e2e-local.sh`, a timeout value of 45 minutes is applied to the `go test` command.**
+**Note that when using `e2e-local.sh`, a timeout value of 90 minutes is applied to the `go test` command.**

You may also run the test command with custom arguments yourself (assuming you've properly setup the required environment variables) from within the `e2e/` directory like so:

```bash
-go test -timeout 30m -v -run Test_All ./
+go test -timeout 90m -v -run Test_All ./
```

## Package Structure
2 changes: 1 addition & 1 deletion e2e/e2e-local.sh
@@ -6,7 +6,7 @@ set -euxo pipefail
: "${SUBSCRIPTION_ID:=8ecadfc9-d1a3-4ea4-b844-0d9f87e4d7c8}" #Azure Container Service - Test Subscription
: "${LOCATION:=eastus}"
: "${AZURE_TENANT_ID:=72f988bf-86f1-41af-91ab-2d7cd011db47}"
: "${TIMEOUT:=45m}"
: "${TIMEOUT:=90m}"

export SUBSCRIPTION_ID
export LOCATION
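Note that the `: "${TIMEOUT:=90m}"` idiom only assigns the default when the variable is unset, so individual runs can still shorten or extend the window by exporting `TIMEOUT` before invoking the script.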
35 changes: 27 additions & 8 deletions e2e/pollers.go
@@ -9,6 +9,7 @@ import (
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice"
"github.com/Azure/go-autorest/autorest/azure"
corev1 "k8s.io/api/core/v1"
@@ -18,8 +19,9 @@ import (

 const (
 	// Polling intervals
-	createVMSSPollingInterval               = 15 * time.Second
-	vmssOperationPollInterval               = 10 * time.Second
+	vmssClientCreateVMSSPollInterval        = 15 * time.Second
+	deleteVMSSPollInterval                  = 10 * time.Second
+	defaultVMSSOperationPollInterval        = 10 * time.Second
 	execOnVMPollInterval                    = 10 * time.Second
 	execOnPodPollInterval                   = 10 * time.Second
 	extractClusterParametersPollInterval    = 10 * time.Second
@@ -28,17 +30,20 @@ const (
 	waitUntilPodRunningPollInterval         = 5 * time.Second
 	waitUntilPodDeletedPollInterval         = 5 * time.Second
 	waitUntilClusterNotCreatingPollInterval = 10 * time.Second
+	waitUntilNodeReadyPollingInterval       = 5 * time.Second

 	// Polling timeouts
-	createVMSSPollingTimeout                = 10 * time.Minute
-	vmssOperationPollingTimeout             = 10 * time.Minute
+	vmssClientCreateVMSSPollingTimeout      = 10 * time.Minute
+	deleteVMSSPollingTimeout                = 5 * time.Minute
+	defaultVMSSOperationPollingTimeout      = 10 * time.Minute
 	execOnVMPollingTimeout                  = 3 * time.Minute
 	execOnPodPollingTimeout                 = 2 * time.Minute
 	extractClusterParametersPollingTimeout  = 3 * time.Minute
 	extractVMLogsPollingTimeout             = 5 * time.Minute
 	getVMPrivateIPAddressPollingTimeout     = 1 * time.Minute
 	waitUntilPodRunningPollingTimeout       = 3 * time.Minute
 	waitUntilPodDeletedPollingTimeout       = 1 * time.Minute
+	waitUntilNodeReadyPollingTimeout        = 3 * time.Minute
 )

func pollExecOnVM(ctx context.Context, kube *kubeclient, vmPrivateIP, jumpboxPodName string, sshPrivateKey, command string, isShellBuiltIn bool) (*podExecResult, error) {
@@ -191,7 +196,7 @@ func waitForClusterCreation(ctx context.Context, cloud *azureClient, resourceGro

 func waitUntilNodeReady(ctx context.Context, kube *kubeclient, vmssName string) (string, error) {
 	var nodeName string
-	err := wait.PollImmediateWithContext(ctx, 5*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
+	err := wait.PollImmediateWithContext(ctx, waitUntilNodeReadyPollingInterval, waitUntilNodeReadyPollingTimeout, func(ctx context.Context) (bool, error) {
 		nodes, err := kube.typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 		if err != nil {
 			return false, err
@@ -240,17 +245,31 @@ type Poller[T any] interface {
 	PollUntilDone(ctx context.Context, options *runtime.PollUntilDoneOptions) (T, error)
 }

-func pollVMSSOperation[T any](ctx context.Context, vmssName string, pollerOpts *runtime.PollUntilDoneOptions, vmssOperation func() (Poller[T], error)) (*T, error) {
+type pollVMSSOperationOpts struct {
+	pollUntilDone   *runtime.PollUntilDoneOptions
+	pollingInterval *time.Duration
+	pollingTimeout  *time.Duration
+}
+
+// TODO: refactor into a new struct which manages the operation independently
+func pollVMSSOperation[T any](ctx context.Context, vmssName string, opts pollVMSSOperationOpts, vmssOperation func() (Poller[T], error)) (*T, error) {
 	var vmssResp T
 	var requestError azure.RequestError

-	pollErr := wait.PollImmediateWithContext(ctx, vmssOperationPollInterval, vmssOperationPollingTimeout, func(ctx context.Context) (bool, error) {
+	if opts.pollingInterval == nil {
+		opts.pollingInterval = to.Ptr(defaultVMSSOperationPollInterval)
+	}
+	if opts.pollingTimeout == nil {
+		opts.pollingTimeout = to.Ptr(defaultVMSSOperationPollingTimeout)
+	}
+
+	pollErr := wait.PollImmediateWithContext(ctx, *opts.pollingInterval, *opts.pollingTimeout, func(ctx context.Context) (bool, error) {
 		poller, err := vmssOperation()
 		if err != nil {
 			log.Printf("error when creating the vmssOperation for VMSS %q: %v", vmssName, err)
 			return false, err
 		}
-		vmssResp, err = poller.PollUntilDone(ctx, pollerOpts)
+		vmssResp, err = poller.PollUntilDone(ctx, opts.pollUntilDone)
 		if err != nil {
 			if errors.As(err, &requestError) && requestError.ServiceError != nil {
 				/*
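The opts struct uses pointer-typed durations so the helper can distinguish "caller set nothing" (nil, fall back to package defaults) from a genuinely short value. Here is a self-contained sketch of that defaulting pattern — names are illustrative, not from this repo, and `ptr` stands in for the SDK's `to.Ptr`:

```go
package main

import (
	"fmt"
	"time"
)

// ptr mirrors the azcore to.Ptr helper used in the diff above.
func ptr[T any](v T) *T { return &v }

type pollOpts struct {
	interval *time.Duration // nil means "use the default"
	timeout  *time.Duration
}

// resolve fills in package defaults for any field the caller left nil.
func resolve(opts pollOpts) (time.Duration, time.Duration) {
	if opts.interval == nil {
		opts.interval = ptr(10 * time.Second)
	}
	if opts.timeout == nil {
		opts.timeout = ptr(10 * time.Minute)
	}
	return *opts.interval, *opts.timeout
}

func main() {
	// Zero-value opts: both fields fall back to the defaults.
	i, t := resolve(pollOpts{})
	fmt.Println(i, t) // 10s 10m0s

	// Caller overrides only the timeout, as the delete path does.
	i, t = resolve(pollOpts{timeout: ptr(5 * time.Minute)})
	fmt.Println(i, t) // 10s 5m0s
}
```

Pointers rather than bare durations are what let an intentional non-default value coexist with omitted fields in the same struct literal.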
16 changes: 8 additions & 8 deletions e2e/suite_test.go
@@ -124,14 +124,6 @@ func runScenario(ctx context.Context, t *testing.T, r *mrand.Rand, opts *scenari
log.Println("vm was unable to be provisioned due to a CSE error, will still atempt to extract provisioning logs...")
}

if vmssModel != nil {
if err := writeToFile(filepath.Join(opts.loggingDir, "vmssId.txt"), *vmssModel.ID); err != nil {
t.Fatalf("failed to write vmss resource ID to disk: %s", err)
}
} else {
log.Printf("WARNING: bootstrapped vmss model was nil for %s", vmssName)
}

if opts.suiteConfig.KeepVMSS {
defer func() {
log.Printf("vmss %q will be retained for debugging purposes, please make sure to manually delete it later", vmssName)
@@ -144,6 +136,14 @@
t.Fatalf("failed to write retained vmss %q private ssh key to disk: %s", vmssName, err)
}
}()
} else {
if vmssModel != nil {
if err := writeToFile(filepath.Join(opts.loggingDir, "vmssId.txt"), *vmssModel.ID); err != nil {
t.Fatalf("failed to write vmss resource ID to disk: %s", err)
}
} else {
log.Printf("WARNING: bootstrapped vmss model was nil for %s", vmssName)
}
}

vmPrivateIP, err := pollGetVMPrivateIP(ctx, vmssName, opts)
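Moving the `vmssId.txt` write into the non-retention branch means the pipeline's "ensure vmss deletion" step only ever sees scale sets that are safe to delete; anything kept via `KEEP_VMSS` never surfaces an ID file. A condensed, standalone sketch of that decision (hypothetical helper, with `os.WriteFile` standing in for the repo's `writeToFile`):

```go
package main

import (
	"os"
	"path/filepath"
)

// recordForCleanup distills the branch above: only VMSSes that are *not*
// retained get their resource ID written where the pipeline's
// "ensure vmss deletion" step will glob for it.
func recordForCleanup(keepVMSS bool, loggingDir, vmssID string) error {
	if keepVMSS {
		// Retained for debugging; never surfaced to the cleanup step.
		return nil
	}
	return os.WriteFile(filepath.Join(loggingDir, "vmssId.txt"), []byte(vmssID), 0o644)
}

func main() {
	_ = recordForCleanup(false, os.TempDir(), "/subscriptions/.../virtualMachineScaleSets/example")
}
```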
40 changes: 23 additions & 17 deletions e2e/vmss.go
@@ -33,19 +33,20 @@ func bootstrapVMSS(ctx context.Context, t *testing.T, r *mrand.Rand, vmssName st

 	cleanupVMSS := func() {
 		log.Printf("deleting vmss %q", vmssName)
-		vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) {
+		if _, err := pollVMSSOperation(ctx, vmssName, pollVMSSOperationOpts{
+			pollingInterval: to.Ptr(deleteVMSSPollInterval),
+			pollingTimeout:  to.Ptr(deleteVMSSPollingTimeout),
+		}, func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) {
 			return opts.cloud.vmssClient.BeginDelete(ctx, *opts.clusterConfig.cluster.Properties.NodeResourceGroup, vmssName, nil)
-		}
-		_, err := pollVMSSOperation(ctx, vmssName, nil, vmssOperation)
-		if err != nil {
-			t.Error("error polling deleting vmss", vmssName, err)
+		}); err != nil {
+			t.Errorf("encountered an error while waiting for deletion of vmss %q: %s", vmssName, err)
 		}
 		log.Printf("finished deleting vmss %q", vmssName)
 	}

 	vmssModel, err := createVMSSWithPayload(ctx, nodeBootstrapping.CustomData, nodeBootstrapping.CSE, vmssName, publicKeyBytes, opts)
 	if err != nil {
-		return nil, nil, fmt.Errorf("unable to create VMSS with payload: %w", err)
+		return nil, cleanupVMSS, fmt.Errorf("unable to create VMSS with payload: %w", err)
 	}

 	return vmssModel, cleanupVMSS, nil
@@ -76,22 +77,27 @@ func createVMSSWithPayload(ctx context.Context, customData, cseCmd, vmssName str
 		return nil, fmt.Errorf("unable to prepare model for VMSS %q: %w", vmssName, err)
 	}

-	createVMSSCtx, cancel := context.WithTimeout(ctx, createVMSSPollingTimeout)
+	createVMSSCtx, cancel := context.WithTimeout(ctx, vmssClientCreateVMSSPollingTimeout)
 	defer cancel()

-	vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
-		return opts.cloud.vmssClient.BeginCreateOrUpdate(
-			ctx,
-			*opts.clusterConfig.cluster.Properties.NodeResourceGroup,
-			vmssName,
-			model,
-			nil,
-		)
-	}
-	vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, &runtime.PollUntilDoneOptions{Frequency: createVMSSPollingInterval}, vmssOperation)
+	vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, pollVMSSOperationOpts{
+		pollUntilDone: &runtime.PollUntilDoneOptions{
+			Frequency: vmssClientCreateVMSSPollInterval,
+		},
+	},
+		func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
+			return opts.cloud.vmssClient.BeginCreateOrUpdate(
+				ctx,
+				*opts.clusterConfig.cluster.Properties.NodeResourceGroup,
+				vmssName,
+				model,
+				nil,
+			)
+		})
 	if err != nil {
 		return nil, fmt.Errorf("unable to create VMSS %q: %w", vmssName, err)
 	}

 	return &vmssResp.VirtualMachineScaleSet, nil
 }

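Because `pollVMSSOperation` is generic over the small `Poller[T]` interface, the create and delete paths above share one retry-and-wait loop. A toy, self-contained illustration of that shape (fake poller types, not the Azure SDK):

```go
package main

import (
	"context"
	"fmt"
)

// poller abstracts "an in-flight operation you can wait on" — the same
// shape as the repo's Poller[T], minus the SDK options parameter.
type poller[T any] interface {
	pollUntilDone(ctx context.Context) (T, error)
}

// fakeDelete stands in for an SDK poller such as
// VirtualMachineScaleSetsClientDeleteResponse's.
type fakeDelete struct{}

func (fakeDelete) pollUntilDone(ctx context.Context) (string, error) {
	return "deleted", nil
}

// runOperation mirrors pollVMSSOperation's core: start (or restart) the
// operation via the supplied closure, then block until it completes.
func runOperation[T any](ctx context.Context, start func() (poller[T], error)) (T, error) {
	var zero T
	p, err := start()
	if err != nil {
		return zero, err
	}
	return p.pollUntilDone(ctx)
}

func main() {
	out, err := runOperation(context.Background(), func() (poller[string], error) {
		return fakeDelete{}, nil
	})
	fmt.Println(out, err) // deleted <nil>
}
```

The payoff of the generic helper is that retry policy, logging, and service-error classification live in one place while each call site supplies only its own begin-operation closure and polling options.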