From f9f94988755400e19c424b52edc10fc175eda46e Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:28:28 -0800 Subject: [PATCH] fix: removing nic after vm creation fails (#68) * fix: removing nic after vm creation fails * test(fakes): adding network interface api BeginDelete functionality * test: adding tests validating network interface is deleted on provisioning failure * feat: removing all resources on launchInstance() failure, and minor refactor of vmName, nicName to be one resourceName * style: gofmt for cleanupResources function * test: retry policy * feat: deleting nics on CloudProvider.Delete() * test: updating network interface api to return proper not found errors * fix: removing nic per call retry policy * fix: better naming on error tests * style: adding some comments to explain rationale behind deletion ordering --------- Co-authored-by: Rachel Gregory --- pkg/fake/networkinterfaceapi.go | 21 ++++++-- pkg/providers/instance/azure_client.go | 1 + pkg/providers/instance/instance.go | 62 +++++++++++++----------- pkg/providers/instancetype/suite_test.go | 48 +++++++++++++++++- pkg/utils/opts/armopts.go | 7 +++ 5 files changed, 105 insertions(+), 34 deletions(-) diff --git a/pkg/fake/networkinterfaceapi.go b/pkg/fake/networkinterfaceapi.go index 3847f7bb2..4537b0beb 100644 --- a/pkg/fake/networkinterfaceapi.go +++ b/pkg/fake/networkinterfaceapi.go @@ -21,6 +21,8 @@ import ( "fmt" "sync" + "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" + "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork" "github.com/Azure/go-autorest/autorest/to" @@ -34,8 +36,13 @@ type NetworkInterfaceCreateOrUpdateInput struct { Options *armnetwork.InterfacesClientBeginCreateOrUpdateOptions } +type NetworkInterfaceDeleteInput struct { + ResourceGroupName, InterfaceName string +} + 
type NetworkInterfacesBehavior struct { NetworkInterfacesCreateOrUpdateBehavior MockedLRO[NetworkInterfaceCreateOrUpdateInput, armnetwork.InterfacesClientCreateOrUpdateResponse] + NetworkInterfacesDeleteBehavior MockedLRO[NetworkInterfaceDeleteInput, armnetwork.InterfacesClientDeleteResponse] NetworkInterfaces sync.Map } @@ -79,7 +86,7 @@ func (c *NetworkInterfacesAPI) Get(_ context.Context, resourceGroupName string, id := mkNetworkInterfaceID(resourceGroupName, interfaceName) iface, ok := c.NetworkInterfaces.Load(id) if !ok { - return armnetwork.InterfacesClientGetResponse{}, fmt.Errorf("not found") + return armnetwork.InterfacesClientGetResponse{}, &azcore.ResponseError{ErrorCode: errors.ResourceNotFound} } return armnetwork.InterfacesClientGetResponse{ Interface: iface.(armnetwork.Interface), @@ -87,9 +94,15 @@ func (c *NetworkInterfacesAPI) Get(_ context.Context, resourceGroupName string, } func (c *NetworkInterfacesAPI) BeginDelete(_ context.Context, resourceGroupName string, interfaceName string, _ *armnetwork.InterfacesClientBeginDeleteOptions) (*runtime.Poller[armnetwork.InterfacesClientDeleteResponse], error) { - id := mkNetworkInterfaceID(resourceGroupName, interfaceName) - c.NetworkInterfaces.Delete(id) - return nil, nil + input := &NetworkInterfaceDeleteInput{ + ResourceGroupName: resourceGroupName, + InterfaceName: interfaceName, + } + return c.NetworkInterfacesDeleteBehavior.Invoke(input, func(input *NetworkInterfaceDeleteInput) (*armnetwork.InterfacesClientDeleteResponse, error) { + id := mkNetworkInterfaceID(resourceGroupName, interfaceName) + c.NetworkInterfaces.Delete(id) + return &armnetwork.InterfacesClientDeleteResponse{}, nil + }) } func mkNetworkInterfaceID(resourceGroupName, interfaceName string) string { diff --git a/pkg/providers/instance/azure_client.go b/pkg/providers/instance/azure_client.go index 08c7ee9ce..0fb7a6a0b 100644 --- a/pkg/providers/instance/azure_client.go +++ b/pkg/providers/instance/azure_client.go @@ -136,6 +136,7 @@ 
func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment) if err != nil { return nil, err } + interfacesClient, err := armnetwork.NewInterfacesClient(cfg.SubscriptionID, cred, opts) if err != nil { return nil, err diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index e0465c92d..aae8885e9 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -19,6 +19,7 @@ package instance import ( "context" "encoding/json" + "errors" "fmt" "math" "sort" @@ -123,6 +124,9 @@ func (p *Provider) Create(ctx context.Context, nodeClass *v1alpha2.AKSNodeClass, instanceTypes = orderInstanceTypesByPrice(instanceTypes, scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...)) vm, instanceType, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes) if err != nil { + if cleanupErr := p.cleanupAzureResources(ctx, GenerateResourceName(nodeClaim.Name)); cleanupErr != nil { + logging.FromContext(ctx).Errorf("failed to cleanup resources for node claim %s, %v", nodeClaim.Name, cleanupErr) + } return nil, err } zone, err := GetZoneID(vm) @@ -175,27 +179,19 @@ func (p *Provider) List(ctx context.Context) ([]*armcompute.VirtualMachine, erro return vmList, nil } -func (p *Provider) Delete(ctx context.Context, vmName string) error { - logging.FromContext(ctx).Debugf("Deleting virtual machine %s", vmName) - return deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName) +func (p *Provider) Delete(ctx context.Context, resourceName string) error { + logging.FromContext(ctx).Debugf("Deleting virtual machine %s and associated resources", resourceName) + return p.cleanupAzureResources(ctx, resourceName) } // createAKSIdentifyingExtension attaches a VM extension to identify that this VM participates in an AKS cluster -func (p *Provider) createAKSIdentifyingExtension( - ctx context.Context, - vmName, _ string, -) error { - var err error +func (p *Provider) 
createAKSIdentifyingExtension(ctx context.Context, vmName string) (err error) { vmExt := p.getAKSIdentifyingExtension() vmExtName := *vmExt.Name logging.FromContext(ctx).Debugf("Creating virtual machine AKS identifying extension for %s", vmName) v, err := createVirtualMachineExtension(ctx, p.azClient.virtualMachinesExtensionClient, p.resourceGroup, vmName, vmExtName, *vmExt) if err != nil { logging.FromContext(ctx).Errorf("Creating VM AKS identifying extension for VM %q failed, %w", vmName, err) - vmErr := deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName) - if vmErr != nil { - logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr) - } return fmt.Errorf("creating VM AKS identifying extension for VM %q, %w failed", vmName, err) } logging.FromContext(ctx).Debugf("Created virtual machine AKS identifying extension for %s, with an id of %s", vmName, *v.ID) @@ -240,7 +236,7 @@ func (p *Provider) newNetworkInterfaceForVM(vmName string, backendPools *loadbal } } -func GenerateVMName(nodeClaimName string) string { +func GenerateResourceName(nodeClaimName string) string { return fmt.Sprintf("aks-%s", nodeClaimName) } @@ -255,9 +251,6 @@ func (p *Provider) createNetworkInterface(ctx context.Context, nicName string, l logging.FromContext(ctx).Debugf("Creating network interface %s", nicName) res, err := createNic(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName, nic) if err != nil { - if nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName); nicErr != nil { - logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", nicName, nicErr) - } return "", err } logging.FromContext(ctx).Debugf("Successfully created network interface: %v", *res.ID) @@ -374,14 +367,10 @@ func setVMTagsProvisionerName(tags map[string]*string, nodeClaim *corev1beta1.No } } -func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, 
vmName, _ string) (*armcompute.VirtualMachine, error) { +func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName string) (*armcompute.VirtualMachine, error) { result, err := CreateVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName, vm) if err != nil { logging.FromContext(ctx).Errorf("Creating virtual machine %q failed: %v", vmName, err) - vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName) - if vmErr != nil { - logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr) - } return nil, fmt.Errorf("virtualMachine.BeginCreateOrUpdate for VM %q failed: %w", vmName, err) } logging.FromContext(ctx).Debugf("Created virtual machine %s", *result.ID) @@ -398,29 +387,28 @@ func (p *Provider) launchInstance( if err != nil { return nil, nil, fmt.Errorf("getting launch template: %w", err) } - - vmName := GenerateVMName(nodeClaim.Name) + // resourceName for the NIC, VM, and Disk + resourceName := GenerateResourceName(nodeClaim.Name) // create network interface - nicName := vmName - nicReference, err := p.createNetworkInterface(ctx, vmName, launchTemplate, instanceType) + nicReference, err := p.createNetworkInterface(ctx, resourceName, launchTemplate, instanceType) if err != nil { return nil, nil, err } sshPublicKey := settings.FromContext(ctx).SSHPublicKey nodeIdentityIDs := settings.FromContext(ctx).NodeIdentities - vm := newVMObject(vmName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType) + vm := newVMObject(resourceName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType) - logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", vmName, instanceType.Name) + logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", resourceName, instanceType.Name) // Uses 
AZ Client to create a new virtual machine using the vm object we prepared earlier - resp, err := p.createVirtualMachine(ctx, vm, vmName, nicName) + resp, err := p.createVirtualMachine(ctx, vm, resourceName) if err != nil { azErr := p.handleResponseErrors(ctx, instanceType, zone, capacityType, err) return nil, nil, azErr } - err = p.createAKSIdentifyingExtension(ctx, vmName, nicName) + err = p.createAKSIdentifyingExtension(ctx, resourceName) if err != nil { return nil, nil, err } @@ -561,6 +549,22 @@ func (p *Provider) pickSkuSizePriorityAndZone(ctx context.Context, nodeClaim *co return nil, "", "" } +func (p *Provider) cleanupAzureResources(ctx context.Context, resourceName string) (err error) { + vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, resourceName) + if vmErr != nil { + logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", resourceName, vmErr) + } + // The order here is intentional, if the VM was created successfully, then we attempt to delete the vm, the + // nic, disk and all associated resources will be removed. If the VM was not created successfully and a nic was found, + // then we attempt to delete the nic. + nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, resourceName) + if nicErr != nil { + logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", resourceName, nicErr) + } + + return errors.Join(vmErr, nicErr) +} + // getPriorityForInstanceType selects spot if both constraints are flexible and there is an available offering. // The Azure Cloud Provider defaults to Regular, so spot must be explicitly included in capacity type requirements. 
// diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 4a3739fec..2c559794a 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -145,7 +145,36 @@ var _ = Describe("InstanceType Provider", func() { ExpectCleanedUp(ctx, env.Client) }) - Context("subscription level quota error responses", func() { + Context("vm creation error responses", func() { + It("should delete the network interface on failure to create the vm", func() { + ErrMsg := "test error" + ErrCode := fmt.Sprint(http.StatusNotFound) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: ErrCode, + RawResponse: &http.Response{ + Body: createSDKErrorBody(ErrCode, ErrMsg), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + // We should have created a nic for the vm + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + // The nic we used in the vm create, should be cleaned up if the vm call fails + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID) + Expect(ok).To(Equal(false)) + + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + }) It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it 
results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) @@ -160,6 +189,15 @@ var _ = Describe("InstanceType Provider", func() { pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectNotScheduled(ctx, env.Client, pod) + + // We should have created a nic for the vm + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + // The nic we used in the vm create, should be cleaned up if the vm call fails + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID) + Expect(ok).To(Equal(false)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) pod = coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) @@ -179,6 +217,14 @@ var _ = Describe("InstanceType Provider", func() { pod := 
coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectNotScheduled(ctx, env.Client, pod) + // We should have created a nic for the vm + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + // The nic we used in the vm create, should be cleaned up if the vm call fails + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID) + Expect(ok).To(Equal(false)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) pod = coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) diff --git a/pkg/utils/opts/armopts.go b/pkg/utils/opts/armopts.go index 68e5834d9..4c4655076 100644 --- a/pkg/utils/opts/armopts.go +++ b/pkg/utils/opts/armopts.go @@ -25,6 +25,8 @@ import ( "github.com/Azure/karpenter/pkg/auth" ) +var MaxRetries = 20 + func DefaultArmOpts() *arm.ClientOptions { opts := &arm.ClientOptions{} opts.Telemetry = DefaultTelemetryOpts() @@ -33,6 +35,11 @@ func DefaultArmOpts() *arm.ClientOptions { return opts } +func DefaultNICClientOpts() *arm.ClientOptions { + opts := DefaultArmOpts() + return opts +} + func DefaultRetryOpts() policy.RetryOptions { return policy.RetryOptions{ MaxRetries: 20,