From ca694ce69186daab0daa341473896d3907115e45 Mon Sep 17 00:00:00 2001 From: Cameron Meissner Date: Wed, 3 Jan 2024 10:21:00 -0800 Subject: [PATCH] fix: deletion of vmss when provisioning fails during e2es --- e2e/pollers.go | 33 ++++++++++++++++++++++++++------- e2e/vmss.go | 38 ++++++++++++++++++++++---------------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/e2e/pollers.go b/e2e/pollers.go index ab5e1979d57..c1f87f72677 100644 --- a/e2e/pollers.go +++ b/e2e/pollers.go @@ -9,6 +9,7 @@ import ( "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice" "github.com/Azure/go-autorest/autorest/azure" corev1 "k8s.io/api/core/v1" @@ -18,8 +19,9 @@ import ( const ( // Polling intervals - createVMSSPollingInterval = 15 * time.Second - vmssOperationPollInterval = 10 * time.Second + vmssClientCreateVMSSPollInterval = 15 * time.Second + deleteVMSSPollInterval = 10 * time.Second + defaultVMSSOperationPollInterval = 10 * time.Second execOnVMPollInterval = 10 * time.Second execOnPodPollInterval = 10 * time.Second extractClusterParametersPollInterval = 10 * time.Second @@ -28,10 +30,12 @@ const ( waitUntilPodRunningPollInterval = 5 * time.Second waitUntilPodDeletedPollInterval = 5 * time.Second waitUntilClusterNotCreatingPollInterval = 10 * time.Second + waitUntilNodeReadyPollingInterval = 5 * time.Second // Polling timeouts createVMSSPollingTimeout = 10 * time.Minute - vmssOperationPollingTimeout = 10 * time.Minute + deleteVMSSPollingTimeout = 5 * time.Minute + defaultVMSSOperationPollingTimeout = 10 * time.Minute execOnVMPollingTimeout = 3 * time.Minute execOnPodPollingTimeout = 2 * time.Minute extractClusterParametersPollingTimeout = 3 * time.Minute @@ -39,6 +43,7 @@ const ( getVMPrivateIPAddressPollingTimeout = 1 * time.Minute waitUntilPodRunningPollingTimeout = 3 * time.Minute waitUntilPodDeletedPollingTimeout = 1 * time.Minute + waitUntilNodeReadyPollingTimeout = 3 * time.Minute ) func pollExecOnVM(ctx context.Context, kube *kubeclient, vmPrivateIP, jumpboxPodName string, sshPrivateKey, command string, isShellBuiltIn bool) (*podExecResult, error) { @@ -191,7 +196,7 @@ func waitForClusterCreation(ctx context.Context, cloud *azureClient, resourceGro func waitUntilNodeReady(ctx context.Context, kube *kubeclient, vmssName string) (string, error) { var nodeName string - err := wait.PollImmediateWithContext(ctx, 5*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) { + err := wait.PollImmediateWithContext(ctx, waitUntilNodeReadyPollingInterval, waitUntilNodeReadyPollingTimeout, func(ctx context.Context) (bool, error) { nodes, err := kube.typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return false, err @@ -240,17 +245,31 @@ type Poller[T any] interface { PollUntilDone(ctx context.Context, options *runtime.PollUntilDoneOptions) (T, error) } -func pollVMSSOperation[T any](ctx context.Context, vmssName string, pollerOpts *runtime.PollUntilDoneOptions, vmssOperation func() (Poller[T], error)) (*T, error) { +type pollVMSSOperationOpts struct { + pollUntilDone *runtime.PollUntilDoneOptions + pollingInterval *time.Duration + pollingTimeout *time.Duration +} + +// TODO: refactor into a new struct which manages the operation independently +func pollVMSSOperation[T any](ctx context.Context, vmssName string, opts pollVMSSOperationOpts, vmssOperation func() (Poller[T], error)) (*T, error) { var vmssResp T var requestError azure.RequestError - pollErr := wait.PollImmediateWithContext(ctx, vmssOperationPollInterval, vmssOperationPollingTimeout, func(ctx context.Context) (bool, error) { + if opts.pollingInterval == nil { + opts.pollingInterval = to.Ptr(defaultVMSSOperationPollInterval) + } + if opts.pollingTimeout == nil { + opts.pollingTimeout = to.Ptr(defaultVMSSOperationPollingTimeout) + } + + pollErr := wait.PollImmediateWithContext(ctx, *opts.pollingInterval, *opts.pollingTimeout, func(ctx context.Context) (bool, error) { poller, err := vmssOperation() if err != nil { log.Printf("error when creating the vmssOperation for VMSS %q: %v", vmssName, err) return false, err } - vmssResp, err = poller.PollUntilDone(ctx, pollerOpts) + vmssResp, err = poller.PollUntilDone(ctx, opts.pollUntilDone) if err != nil { if errors.As(err, &requestError) && requestError.ServiceError != nil { /* diff --git a/e2e/vmss.go b/e2e/vmss.go index bf0788d3080..84e5ac1f6ec 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -33,19 +33,20 @@ func bootstrapVMSS(ctx context.Context, t *testing.T, r *mrand.Rand, vmssName st cleanupVMSS := func() { log.Printf("deleting vmss %q", vmssName) - vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) { + if _, err := pollVMSSOperation(ctx, vmssName, pollVMSSOperationOpts{ + pollingInterval: to.Ptr(deleteVMSSPollInterval), + pollingTimeout: to.Ptr(deleteVMSSPollingTimeout), + }, func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) { return opts.cloud.vmssClient.BeginDelete(ctx, *opts.clusterConfig.cluster.Properties.NodeResourceGroup, vmssName, nil) - } - _, err := pollVMSSOperation(ctx, vmssName, nil, vmssOperation) - if err != nil { - t.Error("error polling deleting vmss", vmssName, err) + }); err != nil { + t.Errorf("encountered an error while waiting for deletion of vmss %q: %s", vmssName, err) } log.Printf("finished deleting vmss %q", vmssName) } vmssModel, err := createVMSSWithPayload(ctx, nodeBootstrapping.CustomData, nodeBootstrapping.CSE, vmssName, publicKeyBytes, opts) if err != nil { - return nil, nil, fmt.Errorf("unable to create VMSS with payload: %w", err) + return nil, cleanupVMSS, fmt.Errorf("unable to create VMSS with payload: %w", err) } return vmssModel, cleanupVMSS, nil @@ -79,19 +80,24 @@ func createVMSSWithPayload(ctx context.Context, customData, cseCmd, vmssName str createVMSSCtx, cancel := context.WithTimeout(ctx, createVMSSPollingTimeout) defer cancel() - vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) { - return opts.cloud.vmssClient.BeginCreateOrUpdate( - ctx, - *opts.clusterConfig.cluster.Properties.NodeResourceGroup, - vmssName, - model, - nil, - ) - } - vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, &runtime.PollUntilDoneOptions{Frequency: createVMSSPollingInterval}, vmssOperation) + vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, pollVMSSOperationOpts{ + pollUntilDone: &runtime.PollUntilDoneOptions{ + Frequency: vmssClientCreateVMSSPollInterval, + }, + }, + func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) { + return opts.cloud.vmssClient.BeginCreateOrUpdate( + ctx, + *opts.clusterConfig.cluster.Properties.NodeResourceGroup, + vmssName, + model, + nil, + ) + }) if err != nil { return nil, fmt.Errorf("unable to create VMSS %q: %w", vmssName, err) } + return &vmssResp.VirtualMachineScaleSet, nil }