From ca694ce69186daab0daa341473896d3907115e45 Mon Sep 17 00:00:00 2001
From: Cameron Meissner <cameissner@microsoft.com>
Date: Wed, 3 Jan 2024 10:21:00 -0800
Subject: [PATCH] fix: deletion of vmss when provisioning fails during e2es

---
 e2e/pollers.go | 33 ++++++++++++++++++++++++++-------
 e2e/vmss.go    | 38 ++++++++++++++++++++++----------------
 2 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/e2e/pollers.go b/e2e/pollers.go
index ab5e1979d57..c1f87f72677 100644
--- a/e2e/pollers.go
+++ b/e2e/pollers.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
 	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice"
 	"github.com/Azure/go-autorest/autorest/azure"
 	corev1 "k8s.io/api/core/v1"
@@ -18,8 +19,9 @@ import (
 
 const (
 	// Polling intervals
-	createVMSSPollingInterval               = 15 * time.Second
-	vmssOperationPollInterval               = 10 * time.Second
+	vmssClientCreateVMSSPollInterval        = 15 * time.Second
+	deleteVMSSPollInterval                  = 10 * time.Second
+	defaultVMSSOperationPollInterval        = 10 * time.Second
 	execOnVMPollInterval                    = 10 * time.Second
 	execOnPodPollInterval                   = 10 * time.Second
 	extractClusterParametersPollInterval    = 10 * time.Second
@@ -28,10 +30,12 @@ const (
 	waitUntilPodRunningPollInterval         = 5 * time.Second
 	waitUntilPodDeletedPollInterval         = 5 * time.Second
 	waitUntilClusterNotCreatingPollInterval = 10 * time.Second
+	waitUntilNodeReadyPollingInterval       = 5 * time.Second
 
 	// Polling timeouts
 	createVMSSPollingTimeout               = 10 * time.Minute
-	vmssOperationPollingTimeout            = 10 * time.Minute
+	deleteVMSSPollingTimeout               = 5 * time.Minute
+	defaultVMSSOperationPollingTimeout     = 10 * time.Minute
 	execOnVMPollingTimeout                 = 3 * time.Minute
 	execOnPodPollingTimeout                = 2 * time.Minute
 	extractClusterParametersPollingTimeout = 3 * time.Minute
@@ -39,6 +43,7 @@ const (
 	getVMPrivateIPAddressPollingTimeout    = 1 * time.Minute
 	waitUntilPodRunningPollingTimeout      = 3 * time.Minute
 	waitUntilPodDeletedPollingTimeout      = 1 * time.Minute
+	waitUntilNodeReadyPollingTimeout       = 3 * time.Minute
 )
 
 func pollExecOnVM(ctx context.Context, kube *kubeclient, vmPrivateIP, jumpboxPodName string, sshPrivateKey, command string, isShellBuiltIn bool) (*podExecResult, error) {
@@ -191,7 +196,7 @@ func waitForClusterCreation(ctx context.Context, cloud *azureClient, resourceGro
 
 func waitUntilNodeReady(ctx context.Context, kube *kubeclient, vmssName string) (string, error) {
 	var nodeName string
-	err := wait.PollImmediateWithContext(ctx, 5*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
+	err := wait.PollImmediateWithContext(ctx, waitUntilNodeReadyPollingInterval, waitUntilNodeReadyPollingTimeout, func(ctx context.Context) (bool, error) {
 		nodes, err := kube.typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 		if err != nil {
 			return false, err
@@ -240,17 +245,31 @@ type Poller[T any] interface {
 	PollUntilDone(ctx context.Context, options *runtime.PollUntilDoneOptions) (T, error)
 }
 
-func pollVMSSOperation[T any](ctx context.Context, vmssName string, pollerOpts *runtime.PollUntilDoneOptions, vmssOperation func() (Poller[T], error)) (*T, error) {
+type pollVMSSOperationOpts struct { // optional settings consumed by pollVMSSOperation
+	pollUntilDone   *runtime.PollUntilDoneOptions // handed as-is to poller.PollUntilDone; callers may leave it nil
+	pollingInterval *time.Duration                // interval of the outer wait.PollImmediateWithContext loop; nil selects defaultVMSSOperationPollInterval
+	pollingTimeout  *time.Duration                // timeout of the outer wait.PollImmediateWithContext loop; nil selects defaultVMSSOperationPollingTimeout
+}
+
+// TODO: refactor into a new struct which manages the operation independently
+func pollVMSSOperation[T any](ctx context.Context, vmssName string, opts pollVMSSOperationOpts, vmssOperation func() (Poller[T], error)) (*T, error) {
 	var vmssResp T
 	var requestError azure.RequestError
 
-	pollErr := wait.PollImmediateWithContext(ctx, vmssOperationPollInterval, vmssOperationPollingTimeout, func(ctx context.Context) (bool, error) {
+	if opts.pollingInterval == nil {
+		opts.pollingInterval = to.Ptr(defaultVMSSOperationPollInterval)
+	}
+	if opts.pollingTimeout == nil {
+		opts.pollingTimeout = to.Ptr(defaultVMSSOperationPollingTimeout)
+	}
+
+	pollErr := wait.PollImmediateWithContext(ctx, *opts.pollingInterval, *opts.pollingTimeout, func(ctx context.Context) (bool, error) {
 		poller, err := vmssOperation()
 		if err != nil {
 			log.Printf("error when creating the vmssOperation for VMSS %q: %v", vmssName, err)
 			return false, err
 		}
-		vmssResp, err = poller.PollUntilDone(ctx, pollerOpts)
+		vmssResp, err = poller.PollUntilDone(ctx, opts.pollUntilDone)
 		if err != nil {
 			if errors.As(err, &requestError) && requestError.ServiceError != nil {
 				/*
diff --git a/e2e/vmss.go b/e2e/vmss.go
index bf0788d3080..84e5ac1f6ec 100644
--- a/e2e/vmss.go
+++ b/e2e/vmss.go
@@ -33,19 +33,20 @@ func bootstrapVMSS(ctx context.Context, t *testing.T, r *mrand.Rand, vmssName st
 
 	cleanupVMSS := func() {
 		log.Printf("deleting vmss %q", vmssName)
-		vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) {
+		if _, err := pollVMSSOperation(ctx, vmssName, pollVMSSOperationOpts{
+			pollingInterval: to.Ptr(deleteVMSSPollInterval),
+			pollingTimeout:  to.Ptr(deleteVMSSPollingTimeout),
+		}, func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) {
 			return opts.cloud.vmssClient.BeginDelete(ctx, *opts.clusterConfig.cluster.Properties.NodeResourceGroup, vmssName, nil)
-		}
-		_, err := pollVMSSOperation(ctx, vmssName, nil, vmssOperation)
-		if err != nil {
-			t.Error("error polling deleting vmss", vmssName, err)
+		}); err != nil {
+			t.Errorf("encountered an error while waiting for deletion of vmss %q: %s", vmssName, err)
 		}
 		log.Printf("finished deleting vmss %q", vmssName)
 	}
 
 	vmssModel, err := createVMSSWithPayload(ctx, nodeBootstrapping.CustomData, nodeBootstrapping.CSE, vmssName, publicKeyBytes, opts)
 	if err != nil {
-		return nil, nil, fmt.Errorf("unable to create VMSS with payload: %w", err)
+		return nil, cleanupVMSS, fmt.Errorf("unable to create VMSS with payload: %w", err)
 	}
 
 	return vmssModel, cleanupVMSS, nil
@@ -79,19 +80,24 @@ func createVMSSWithPayload(ctx context.Context, customData, cseCmd, vmssName str
 	createVMSSCtx, cancel := context.WithTimeout(ctx, createVMSSPollingTimeout)
 	defer cancel()
 
-	vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
-		return opts.cloud.vmssClient.BeginCreateOrUpdate(
-			ctx,
-			*opts.clusterConfig.cluster.Properties.NodeResourceGroup,
-			vmssName,
-			model,
-			nil,
-		)
-	}
-	vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, &runtime.PollUntilDoneOptions{Frequency: createVMSSPollingInterval}, vmssOperation)
+	vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, pollVMSSOperationOpts{
+		pollUntilDone: &runtime.PollUntilDoneOptions{
+			Frequency: vmssClientCreateVMSSPollInterval,
+		},
+	},
+		func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
+			return opts.cloud.vmssClient.BeginCreateOrUpdate(
+				ctx,
+				*opts.clusterConfig.cluster.Properties.NodeResourceGroup,
+				vmssName,
+				model,
+				nil,
+			)
+		})
 	if err != nil {
 		return nil, fmt.Errorf("unable to create VMSS %q: %w", vmssName, err)
 	}
+
 	return &vmssResp.VirtualMachineScaleSet, nil
 }