Skip to content

Commit

Permalink
fix: deletion of vmss when provisioning fails during e2es
Browse files Browse the repository at this point in the history
  • Loading branch information
Cameron Meissner committed Jan 3, 2024
1 parent d3f05bd commit ca694ce
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 23 deletions.
33 changes: 26 additions & 7 deletions e2e/pollers.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice"
"github.com/Azure/go-autorest/autorest/azure"
corev1 "k8s.io/api/core/v1"
Expand All @@ -18,8 +19,9 @@ import (

const (
// Polling intervals
createVMSSPollingInterval = 15 * time.Second
vmssOperationPollInterval = 10 * time.Second
vmssClientCreateVMSSPollInterval = 15 * time.Second
deleteVMSSPollInterval = 10 * time.Second
defaultVMSSOperationPollInterval = 10 * time.Second
execOnVMPollInterval = 10 * time.Second
execOnPodPollInterval = 10 * time.Second
extractClusterParametersPollInterval = 10 * time.Second
Expand All @@ -28,17 +30,20 @@ const (
waitUntilPodRunningPollInterval = 5 * time.Second
waitUntilPodDeletedPollInterval = 5 * time.Second
waitUntilClusterNotCreatingPollInterval = 10 * time.Second
waitUntilNodeReadyPollingInterval = 5 * time.Second

// Polling timeouts
createVMSSPollingTimeout = 10 * time.Minute
vmssOperationPollingTimeout = 10 * time.Minute
deleteVMSSPollingTimeout = 5 * time.Minute
defaultVMSSOperationPollingTimeout = 10 * time.Minute
execOnVMPollingTimeout = 3 * time.Minute
execOnPodPollingTimeout = 2 * time.Minute
extractClusterParametersPollingTimeout = 3 * time.Minute
extractVMLogsPollingTimeout = 5 * time.Minute
getVMPrivateIPAddressPollingTimeout = 1 * time.Minute
waitUntilPodRunningPollingTimeout = 3 * time.Minute
waitUntilPodDeletedPollingTimeout = 1 * time.Minute
waitUntilNodeReadyPollingTimeout = 3 * time.Minute
)

func pollExecOnVM(ctx context.Context, kube *kubeclient, vmPrivateIP, jumpboxPodName string, sshPrivateKey, command string, isShellBuiltIn bool) (*podExecResult, error) {
Expand Down Expand Up @@ -191,7 +196,7 @@ func waitForClusterCreation(ctx context.Context, cloud *azureClient, resourceGro

func waitUntilNodeReady(ctx context.Context, kube *kubeclient, vmssName string) (string, error) {
var nodeName string
err := wait.PollImmediateWithContext(ctx, 5*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
err := wait.PollImmediateWithContext(ctx, waitUntilNodeReadyPollingInterval, waitUntilNodeReadyPollingTimeout, func(ctx context.Context) (bool, error) {
nodes, err := kube.typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return false, err
Expand Down Expand Up @@ -240,17 +245,31 @@ type Poller[T any] interface {
PollUntilDone(ctx context.Context, options *runtime.PollUntilDoneOptions) (T, error)
}

func pollVMSSOperation[T any](ctx context.Context, vmssName string, pollerOpts *runtime.PollUntilDoneOptions, vmssOperation func() (Poller[T], error)) (*T, error) {
// pollVMSSOperationOpts groups the optional settings consumed by pollVMSSOperation.
type pollVMSSOperationOpts struct {
// pollUntilDone is forwarded as-is to the poller's PollUntilDone call.
pollUntilDone *runtime.PollUntilDoneOptions
// pollingInterval is the interval handed to wait.PollImmediateWithContext;
// when nil, pollVMSSOperation substitutes defaultVMSSOperationPollInterval.
pollingInterval *time.Duration
// pollingTimeout is the timeout handed to wait.PollImmediateWithContext;
// when nil, pollVMSSOperation substitutes defaultVMSSOperationPollingTimeout.
pollingTimeout *time.Duration
}

// TODO: refactor into a new struct which manages the operation independently
func pollVMSSOperation[T any](ctx context.Context, vmssName string, opts pollVMSSOperationOpts, vmssOperation func() (Poller[T], error)) (*T, error) {
var vmssResp T
var requestError azure.RequestError

pollErr := wait.PollImmediateWithContext(ctx, vmssOperationPollInterval, vmssOperationPollingTimeout, func(ctx context.Context) (bool, error) {
if opts.pollingInterval == nil {
opts.pollingInterval = to.Ptr(defaultVMSSOperationPollInterval)
}
if opts.pollingTimeout == nil {
opts.pollingTimeout = to.Ptr(defaultVMSSOperationPollingTimeout)
}

pollErr := wait.PollImmediateWithContext(ctx, *opts.pollingInterval, *opts.pollingTimeout, func(ctx context.Context) (bool, error) {
poller, err := vmssOperation()
if err != nil {
log.Printf("error when creating the vmssOperation for VMSS %q: %v", vmssName, err)
return false, err
}
vmssResp, err = poller.PollUntilDone(ctx, pollerOpts)
vmssResp, err = poller.PollUntilDone(ctx, opts.pollUntilDone)
if err != nil {
if errors.As(err, &requestError) && requestError.ServiceError != nil {
/*
Expand Down
38 changes: 22 additions & 16 deletions e2e/vmss.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,20 @@ func bootstrapVMSS(ctx context.Context, t *testing.T, r *mrand.Rand, vmssName st

cleanupVMSS := func() {
log.Printf("deleting vmss %q", vmssName)
vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) {
if _, err := pollVMSSOperation(ctx, vmssName, pollVMSSOperationOpts{
pollingInterval: to.Ptr(deleteVMSSPollInterval),
pollingTimeout: to.Ptr(deleteVMSSPollingTimeout),
}, func() (Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], error) {
return opts.cloud.vmssClient.BeginDelete(ctx, *opts.clusterConfig.cluster.Properties.NodeResourceGroup, vmssName, nil)
}
_, err := pollVMSSOperation(ctx, vmssName, nil, vmssOperation)
if err != nil {
t.Error("error polling deleting vmss", vmssName, err)
}); err != nil {
t.Errorf("encountered an error while waiting for deletion of vmss %q: %s", vmssName, err)
}
log.Printf("finished deleting vmss %q", vmssName)
}

vmssModel, err := createVMSSWithPayload(ctx, nodeBootstrapping.CustomData, nodeBootstrapping.CSE, vmssName, publicKeyBytes, opts)
if err != nil {
return nil, nil, fmt.Errorf("unable to create VMSS with payload: %w", err)
return nil, cleanupVMSS, fmt.Errorf("unable to create VMSS with payload: %w", err)
}

return vmssModel, cleanupVMSS, nil
Expand Down Expand Up @@ -79,19 +80,24 @@ func createVMSSWithPayload(ctx context.Context, customData, cseCmd, vmssName str
createVMSSCtx, cancel := context.WithTimeout(ctx, createVMSSPollingTimeout)
defer cancel()

vmssOperation := func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
return opts.cloud.vmssClient.BeginCreateOrUpdate(
ctx,
*opts.clusterConfig.cluster.Properties.NodeResourceGroup,
vmssName,
model,
nil,
)
}
vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, &runtime.PollUntilDoneOptions{Frequency: createVMSSPollingInterval}, vmssOperation)
vmssResp, err := pollVMSSOperation(createVMSSCtx, vmssName, pollVMSSOperationOpts{
pollUntilDone: &runtime.PollUntilDoneOptions{
Frequency: vmssClientCreateVMSSPollInterval,
},
},
func() (Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
return opts.cloud.vmssClient.BeginCreateOrUpdate(
ctx,
*opts.clusterConfig.cluster.Properties.NodeResourceGroup,
vmssName,
model,
nil,
)
})
if err != nil {
return nil, fmt.Errorf("unable to create VMSS %q: %w", vmssName, err)
}

return &vmssResp.VirtualMachineScaleSet, nil
}

Expand Down

0 comments on commit ca694ce

Please sign in to comment.