Skip to content

Commit

Permalink
fix: removing nic after vm creation fails (#68)
Browse files Browse the repository at this point in the history
* fix: removing nic after vm creation fails

* test(fakes): adding network interface api BeginDelete functionality

* test: adding tests validating network interface is deleted on provisioning failure

* feat: removing all resources on launchInstance() failure, and minor refactor of vmName, nicName to be one resourceName

* style: gofmt for cleanupResources function

* test: retry policy

* feat: deleting nics on CloudProvider.Delete()

* test: updating network interface api to return proper not found errors

* fix: removing nic per call retry policy

* fix: better naming on error tests

* style: adding some comments to explain rationale behind deletion ordering

---------

Co-authored-by: Rachel Gregory <[email protected]>
  • Loading branch information
Bryce-Soghigian and rakechill authored Jan 10, 2024
1 parent 8d9211c commit f9f9498
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 34 deletions.
21 changes: 17 additions & 4 deletions pkg/fake/networkinterfaceapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"fmt"
"sync"

"github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork"
"github.com/Azure/go-autorest/autorest/to"
Expand All @@ -34,8 +36,13 @@ type NetworkInterfaceCreateOrUpdateInput struct {
Options *armnetwork.InterfacesClientBeginCreateOrUpdateOptions
}

type NetworkInterfaceDeleteInput struct {
ResourceGroupName, InterfaceName string
}

type NetworkInterfacesBehavior struct {
NetworkInterfacesCreateOrUpdateBehavior MockedLRO[NetworkInterfaceCreateOrUpdateInput, armnetwork.InterfacesClientCreateOrUpdateResponse]
NetworkInterfacesDeleteBehavior MockedLRO[NetworkInterfaceDeleteInput, armnetwork.InterfacesClientDeleteResponse]
NetworkInterfaces sync.Map
}

Expand Down Expand Up @@ -79,17 +86,23 @@ func (c *NetworkInterfacesAPI) Get(_ context.Context, resourceGroupName string,
id := mkNetworkInterfaceID(resourceGroupName, interfaceName)
iface, ok := c.NetworkInterfaces.Load(id)
if !ok {
return armnetwork.InterfacesClientGetResponse{}, fmt.Errorf("not found")
return armnetwork.InterfacesClientGetResponse{}, &azcore.ResponseError{ErrorCode: errors.ResourceNotFound}
}
return armnetwork.InterfacesClientGetResponse{
Interface: iface.(armnetwork.Interface),
}, nil
}

func (c *NetworkInterfacesAPI) BeginDelete(_ context.Context, resourceGroupName string, interfaceName string, _ *armnetwork.InterfacesClientBeginDeleteOptions) (*runtime.Poller[armnetwork.InterfacesClientDeleteResponse], error) {
id := mkNetworkInterfaceID(resourceGroupName, interfaceName)
c.NetworkInterfaces.Delete(id)
return nil, nil
input := &NetworkInterfaceDeleteInput{
ResourceGroupName: resourceGroupName,
InterfaceName: interfaceName,
}
return c.NetworkInterfacesDeleteBehavior.Invoke(input, func(input *NetworkInterfaceDeleteInput) (*armnetwork.InterfacesClientDeleteResponse, error) {
id := mkNetworkInterfaceID(resourceGroupName, interfaceName)
c.NetworkInterfaces.Delete(id)
return &armnetwork.InterfacesClientDeleteResponse{}, nil
})
}

func mkNetworkInterfaceID(resourceGroupName, interfaceName string) string {
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/instance/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment)
if err != nil {
return nil, err
}

interfacesClient, err := armnetwork.NewInterfacesClient(cfg.SubscriptionID, cred, opts)
if err != nil {
return nil, err
Expand Down
62 changes: 33 additions & 29 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package instance
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"sort"
Expand Down Expand Up @@ -123,6 +124,9 @@ func (p *Provider) Create(ctx context.Context, nodeClass *v1alpha2.AKSNodeClass,
instanceTypes = orderInstanceTypesByPrice(instanceTypes, scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...))
vm, instanceType, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes)
if err != nil {
if cleanupErr := p.cleanupAzureResources(ctx, GenerateResourceName(nodeClaim.Name)); cleanupErr != nil {
logging.FromContext(ctx).Errorf("failed to cleanup resources for node claim %s, %w", nodeClaim.Name, cleanupErr)
}
return nil, err
}
zone, err := GetZoneID(vm)
Expand Down Expand Up @@ -175,27 +179,19 @@ func (p *Provider) List(ctx context.Context) ([]*armcompute.VirtualMachine, erro
return vmList, nil
}

func (p *Provider) Delete(ctx context.Context, vmName string) error {
logging.FromContext(ctx).Debugf("Deleting virtual machine %s", vmName)
return deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName)
func (p *Provider) Delete(ctx context.Context, resourceName string) error {
logging.FromContext(ctx).Debugf("Deleting virtual machine %s and associated resources")
return p.cleanupAzureResources(ctx, resourceName)
}

// createAKSIdentifyingExtension attaches a VM extension to identify that this VM participates in an AKS cluster
func (p *Provider) createAKSIdentifyingExtension(
ctx context.Context,
vmName, _ string,
) error {
var err error
func (p *Provider) createAKSIdentifyingExtension(ctx context.Context, vmName string) (err error) {
vmExt := p.getAKSIdentifyingExtension()
vmExtName := *vmExt.Name
logging.FromContext(ctx).Debugf("Creating virtual machine AKS identifying extension for %s", vmName)
v, err := createVirtualMachineExtension(ctx, p.azClient.virtualMachinesExtensionClient, p.resourceGroup, vmName, vmExtName, *vmExt)
if err != nil {
logging.FromContext(ctx).Errorf("Creating VM AKS identifying extension for VM %q failed, %w", vmName, err)
vmErr := deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName)
if vmErr != nil {
logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr)
}
return fmt.Errorf("creating VM AKS identifying extension for VM %q, %w failed", vmName, err)
}
logging.FromContext(ctx).Debugf("Created virtual machine AKS identifying extension for %s, with an id of %s", vmName, *v.ID)
Expand Down Expand Up @@ -240,7 +236,7 @@ func (p *Provider) newNetworkInterfaceForVM(vmName string, backendPools *loadbal
}
}

func GenerateVMName(nodeClaimName string) string {
func GenerateResourceName(nodeClaimName string) string {
return fmt.Sprintf("aks-%s", nodeClaimName)
}

Expand All @@ -255,9 +251,6 @@ func (p *Provider) createNetworkInterface(ctx context.Context, nicName string, l
logging.FromContext(ctx).Debugf("Creating network interface %s", nicName)
res, err := createNic(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName, nic)
if err != nil {
if nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName); nicErr != nil {
logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", nicName, nicErr)
}
return "", err
}
logging.FromContext(ctx).Debugf("Successfully created network interface: %v", *res.ID)
Expand Down Expand Up @@ -374,14 +367,10 @@ func setVMTagsProvisionerName(tags map[string]*string, nodeClaim *corev1beta1.No
}
}

func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName, _ string) (*armcompute.VirtualMachine, error) {
func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName string) (*armcompute.VirtualMachine, error) {
result, err := CreateVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName, vm)
if err != nil {
logging.FromContext(ctx).Errorf("Creating virtual machine %q failed: %v", vmName, err)
vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName)
if vmErr != nil {
logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr)
}
return nil, fmt.Errorf("virtualMachine.BeginCreateOrUpdate for VM %q failed: %w", vmName, err)
}
logging.FromContext(ctx).Debugf("Created virtual machine %s", *result.ID)
Expand All @@ -398,29 +387,28 @@ func (p *Provider) launchInstance(
if err != nil {
return nil, nil, fmt.Errorf("getting launch template: %w", err)
}

vmName := GenerateVMName(nodeClaim.Name)
// resourceName for the NIC, VM, and Disk
resourceName := GenerateResourceName(nodeClaim.Name)

// create network interface
nicName := vmName
nicReference, err := p.createNetworkInterface(ctx, vmName, launchTemplate, instanceType)
nicReference, err := p.createNetworkInterface(ctx, resourceName, launchTemplate, instanceType)
if err != nil {
return nil, nil, err
}

sshPublicKey := settings.FromContext(ctx).SSHPublicKey
nodeIdentityIDs := settings.FromContext(ctx).NodeIdentities
vm := newVMObject(vmName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType)
vm := newVMObject(resourceName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType)

logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", vmName, instanceType.Name)
logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", resourceName, instanceType.Name)
// Uses AZ Client to create a new virtual machine using the vm object we prepared earlier
resp, err := p.createVirtualMachine(ctx, vm, vmName, nicName)
resp, err := p.createVirtualMachine(ctx, vm, resourceName)
if err != nil {
azErr := p.handleResponseErrors(ctx, instanceType, zone, capacityType, err)
return nil, nil, azErr
}

err = p.createAKSIdentifyingExtension(ctx, vmName, nicName)
err = p.createAKSIdentifyingExtension(ctx, resourceName)
if err != nil {
return nil, nil, err
}
Expand Down Expand Up @@ -561,6 +549,22 @@ func (p *Provider) pickSkuSizePriorityAndZone(ctx context.Context, nodeClaim *co
return nil, "", ""
}

func (p *Provider) cleanupAzureResources(ctx context.Context, resourceName string) (err error) {
vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, resourceName)
if vmErr != nil {
logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", resourceName, vmErr)
}
// The order here is intentional, if the VM was created successfully, then we attempt to delete the vm, the
// nic, disk and all associated resources will be removed. If the VM was not created successfully and a nic was found,
// then we attempt to delete the nic.
nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, resourceName)
if nicErr != nil {
logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", resourceName, nicErr)
}

return errors.Join(vmErr, nicErr)
}

// getPriorityForInstanceType selects spot if both constraints are flexible and there is an available offering.
// The Azure Cloud Provider defaults to Regular, so spot must be explicitly included in capacity type requirements.
//
Expand Down
48 changes: 47 additions & 1 deletion pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,36 @@ var _ = Describe("InstanceType Provider", func() {
ExpectCleanedUp(ctx, env.Client)
})

Context("subscription level quota error responses", func() {
Context("vm creation error responses", func() {
It("should delete the network interface on failure to create the vm", func() {
ErrMsg := "test error"
ErrCode := fmt.Sprint(http.StatusNotFound)
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: ErrCode,
RawResponse: &http.Response{
Body: createSDKErrorBody(ErrCode, ErrMsg),
},
},
)
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)

// We should have created a nic for the vm
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
// The nic we used in the vm create, should be cleaned up if the vm call fails
nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop()
Expect(nic).NotTo(BeNil())
_, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID)
Expect(ok).To(Equal(false))

azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
})
It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() {
familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests"
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
Expand All @@ -160,6 +189,15 @@ var _ = Describe("InstanceType Provider", func() {
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)

// We should have created a nic for the vm
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
// The nic we used in the vm create, should be cleaned up if the vm call fails
nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop()
Expect(nic).NotTo(BeNil())
_, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID)
Expect(ok).To(Equal(false))

azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
Expand All @@ -179,6 +217,14 @@ var _ = Describe("InstanceType Provider", func() {
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)
// We should have created a nic for the vm
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
// The nic we used in the vm create, should be cleaned up if the vm call fails
nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop()
Expect(nic).NotTo(BeNil())
_, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID)
Expect(ok).To(Equal(false))

azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
Expand Down
7 changes: 7 additions & 0 deletions pkg/utils/opts/armopts.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"github.com/Azure/karpenter/pkg/auth"
)

var MaxRetries = 20

func DefaultArmOpts() *arm.ClientOptions {
opts := &arm.ClientOptions{}
opts.Telemetry = DefaultTelemetryOpts()
Expand All @@ -33,6 +35,11 @@ func DefaultArmOpts() *arm.ClientOptions {
return opts
}

func DefaultNICClientOpts() *arm.ClientOptions {
opts := DefaultArmOpts()
return opts
}

func DefaultRetryOpts() policy.RetryOptions {
return policy.RetryOptions{
MaxRetries: 20,
Expand Down

0 comments on commit f9f9498

Please sign in to comment.