From 8d9211c1d6bc3ea4551d51b268469a580184cb52 Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Wed, 10 Jan 2024 10:54:10 -0800 Subject: [PATCH 1/3] chore: Small style + usability refactors for new contributors (#86) * Update skaffold.yaml * Update fake assert statement, standardize casing * NewAZClient calls NewAZClientFromAPI --- pkg/fake/azureresourcegraphapi.go | 2 +- pkg/fake/communityimageversionsapi.go | 2 +- pkg/fake/networkinterfaceapi.go | 2 +- pkg/fake/pricingapi.go | 3 +++ pkg/fake/virtualmachineextensionsapi.go | 2 +- pkg/fake/virtualmachinesapi.go | 2 +- pkg/operator/operator.go | 6 +++--- pkg/providers/instance/azure_client.go | 21 +++++++++------------ skaffold.yaml | 20 ++++++++++---------- 9 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pkg/fake/azureresourcegraphapi.go b/pkg/fake/azureresourcegraphapi.go index e3fa4d07f..f6cd1cee3 100644 --- a/pkg/fake/azureresourcegraphapi.go +++ b/pkg/fake/azureresourcegraphapi.go @@ -39,7 +39,7 @@ type AzureResourceGraphBehavior struct { } // assert that the fake implements the interface -var _ instance.AzureResourceGraphAPI = (*AzureResourceGraphAPI)(nil) +var _ instance.AzureResourceGraphAPI = &AzureResourceGraphAPI{} type AzureResourceGraphAPI struct { AzureResourceGraphBehavior diff --git a/pkg/fake/communityimageversionsapi.go b/pkg/fake/communityimageversionsapi.go index c9d8f265c..860874f22 100644 --- a/pkg/fake/communityimageversionsapi.go +++ b/pkg/fake/communityimageversionsapi.go @@ -30,7 +30,7 @@ type CommunityGalleryImageVersionsAPI struct { } // assert that the fake implements the interface -var _ imagefamily.CommunityGalleryImageVersionsAPI = (*CommunityGalleryImageVersionsAPI)(nil) +var _ imagefamily.CommunityGalleryImageVersionsAPI = &CommunityGalleryImageVersionsAPI{} // NewListPager returns a new pager to return the next page of CommunityGalleryImageVersionsClientListResponse func (c *CommunityGalleryImageVersionsAPI) NewListPager(_ string, _ string, _ string, _ *armcompute.CommunityGalleryImageVersionsClientListOptions) *runtime.Pager[armcompute.CommunityGalleryImageVersionsClientListResponse] { diff --git a/pkg/fake/networkinterfaceapi.go b/pkg/fake/networkinterfaceapi.go index 16eb4402c..3847f7bb2 100644 --- a/pkg/fake/networkinterfaceapi.go +++ b/pkg/fake/networkinterfaceapi.go @@ -40,7 +40,7 @@ type NetworkInterfacesBehavior struct { } // assert that the fake implements the interface -var _ instance.NetworkInterfacesAPI = (*NetworkInterfacesAPI)(nil) +var _ instance.NetworkInterfacesAPI = &NetworkInterfacesAPI{} type NetworkInterfacesAPI struct { // instance.NetworkInterfacesAPI diff --git a/pkg/fake/pricingapi.go b/pkg/fake/pricingapi.go index 146e4eb55..3a8f1f377 100644 --- a/pkg/fake/pricingapi.go +++ b/pkg/fake/pricingapi.go @@ -36,6 +36,9 @@ type PricingBehavior struct { ProductsPricePage AtomicPtr[client.ProductsPricePage] } +// assert that the fake implements the interface +var _ client.PricingAPI = &PricingAPI{} + func (p *PricingAPI) Reset() { p.NextError.Reset() p.ProductsPricePage.Reset() diff --git a/pkg/fake/virtualmachineextensionsapi.go b/pkg/fake/virtualmachineextensionsapi.go index e118117d9..be8539695 100644 --- a/pkg/fake/virtualmachineextensionsapi.go +++ b/pkg/fake/virtualmachineextensionsapi.go @@ -40,7 +40,7 @@ type VirtualMachineExtensionsBehavior struct { } // assert that ComputeAPI implements ARMComputeAPI -var _ instance.VirtualMachineExtensionsAPI = (*VirtualMachineExtensionsAPI)(nil) +var _ instance.VirtualMachineExtensionsAPI = &VirtualMachineExtensionsAPI{} type VirtualMachineExtensionsAPI struct { // instance.VirtualMachineExtensionsAPI diff --git a/pkg/fake/virtualmachinesapi.go b/pkg/fake/virtualmachinesapi.go index 2066f20de..722b4966a 100644 --- a/pkg/fake/virtualmachinesapi.go +++ b/pkg/fake/virtualmachinesapi.go @@ -66,7 +66,7 @@ type VirtualMachinesBehavior struct { } // assert that the fake implements the interface -var _ instance.VirtualMachinesAPI = (*VirtualMachinesAPI)(nil) +var _ instance.VirtualMachinesAPI = &VirtualMachinesAPI{} type VirtualMachinesAPI struct { // TODO: document the implications of embedding vs. not embedding the interface here diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 866c6c02c..f496e6ee3 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -54,10 +54,10 @@ type Operator struct { } func NewOperator(ctx context.Context, operator *operator.Operator) (context.Context, *Operator) { - azConfig, err := GetAzConfig() + azConfig, err := GetAZConfig() lo.Must0(err, "creating Azure config") // TODO: I assume we prefer this over the cleaner azConfig := lo.Must(GetAzConfig()), as this has a helpful error message? - azClient, err := instance.CreateAzClient(ctx, azConfig) + azClient, err := instance.CreateAZClient(ctx, azConfig) lo.Must0(err, "creating Azure client") unavailableOfferingsCache := azurecache.NewUnavailableOfferings() @@ -125,7 +125,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont } } -func GetAzConfig() (*auth.Config, error) { +func GetAZConfig() (*auth.Config, error) { cfg, err := auth.BuildAzureConfig() if err != nil { return nil, err diff --git a/pkg/providers/instance/azure_client.go b/pkg/providers/instance/azure_client.go index d51a68781..08c7ee9ce 100644 --- a/pkg/providers/instance/azure_client.go +++ b/pkg/providers/instance/azure_client.go @@ -94,7 +94,7 @@ func NewAZClientFromAPI( } } -func CreateAzClient(ctx context.Context, cfg *auth.Config) (*AZClient, error) { +func CreateAZClient(ctx context.Context, cfg *auth.Config) (*AZClient, error) { // Defaulting env to Azure Public Cloud. env := azure.PublicCloud var err error @@ -132,7 +132,7 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment) } opts := armopts.DefaultArmOpts() - extClient, err := armcompute.NewVirtualMachineExtensionsClient(cfg.SubscriptionID, cred, opts) + extensionsClient, err := armcompute.NewVirtualMachineExtensionsClient(cfg.SubscriptionID, cred, opts) if err != nil { return nil, err } @@ -177,14 +177,11 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment) // TODO Move this over to track 2 when skewer is migrated skuClient := skuclient.NewSkuClient(ctx, cfg, env) - return &AZClient{ - networkInterfacesClient: interfacesClient, - virtualMachinesClient: virtualMachinesClient, - virtualMachinesExtensionClient: extClient, - azureResourceGraphClient: azureResourceGraphClient, - - ImageVersionsClient: imageVersionsClient, - SKUClient: skuClient, - LoadBalancersClient: loadBalancersClient, - }, nil + return NewAZClientFromAPI(virtualMachinesClient, + azureResourceGraphClient, + extensionsClient, + interfacesClient, + loadBalancersClient, + imageVersionsClient, + skuClient), nil } diff --git a/skaffold.yaml b/skaffold.yaml index cc0eb94c4..a2115e19b 100644 --- a/skaffold.yaml +++ b/skaffold.yaml @@ -9,7 +9,7 @@ build: paths: - '**/*.go' - '**/*.gtpl' - #flags: ['-tags', 'ccp'] + #flags: ['-tags', 'ccp'] manifests: helm: releases: @@ -29,10 +29,10 @@ manifests: drift: true azure: clusterName: karpenter - clusterEndpoint: https://karpenter-tallaxes-azure-k-26fe00-j3imiwu5.hcp.westus2.staging.azmk8s.io:443 - kubeletClientTLSBootstrapToken: "" # TODO: get this from the cluster + clusterEndpoint: "Please run make az-all" + kubeletClientTLSBootstrapToken: "Please run make az-all" # TODO: get this from the cluster # TODO: autogenerate - sshPublicKey: "" + sshPublicKey: "Please run make az-all" networkPlugin: "azure" # TODO: get this from the cluster networkPolicy: "" replicas: 1 # for better debugging experience @@ -46,23 +46,23 @@ manifests: batchMaxDuration: 10s # 60s is a good value for large runs (500+ nodes) env: - name: ARM_SUBSCRIPTION_ID - value: "" + value: "Please run make az-all" - name: LOCATION value: westus2 - name: ARM_USE_MANAGED_IDENTITY_EXTENSION value: "true" - name: ARM_USER_ASSIGNED_IDENTITY_ID - value: "" + value: "Please run make az-all" - name: AZURE_NODE_RESOURCE_GROUP - value: "" + value: "Please run make az-all" - name: AZURE_SUBNET_ID # the id of subnet to create network interfaces on - value: "" + value: "Please run make az-all" - name: LEADER_ELECT # disable leader election for better debugging experience value: "false" - name: AZURE_VNET_NAME - value: "" + value: "Please run make az-all" - name: AZURE_SUBNET_NAME - value: "" + value: "Please run make az-all" # disable HTTP/2 to reduce ARM throttling on large-scale tests; # with this in place write (and read) QPS can be increased too #- name: GODEBUG From f9f94988755400e19c424b52edc10fc175eda46e Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:28:28 -0800 Subject: [PATCH 2/3] fix: removing nic after vm creation fails (#68) * fix: removing nic after vm creation fails * test(fakes): adding network interface api BeginDelete functionality * test: adding tests validating network interface is deleted on provisioning failure * feat: removing all resources on launchInstance() failure, and minor refactor of vmName, nicName to be one resourceName * style: gofmt for cleanupResources function * test: retry policy * feat: deleting nics on CloudProvider.Delete() * test: updating network interface api to return proper not found errors * fix: removing nic per call retry policy * fix: better naming on error tests * style: adding some comments to explain rationale behind deletion ordering --------- Co-authored-by: Rachel Gregory --- pkg/fake/networkinterfaceapi.go | 21 ++++++-- pkg/providers/instance/azure_client.go | 1 + pkg/providers/instance/instance.go | 62 +++++++++++++----------- pkg/providers/instancetype/suite_test.go | 48 +++++++++++++++++- pkg/utils/opts/armopts.go | 7 +++ 5 files changed, 105 insertions(+), 34 deletions(-) diff --git a/pkg/fake/networkinterfaceapi.go b/pkg/fake/networkinterfaceapi.go index 3847f7bb2..4537b0beb 100644 --- a/pkg/fake/networkinterfaceapi.go +++ b/pkg/fake/networkinterfaceapi.go @@ -21,6 +21,8 @@ import ( "fmt" "sync" + "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" + "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork" "github.com/Azure/go-autorest/autorest/to" @@ -34,8 +36,13 @@ type NetworkInterfaceCreateOrUpdateInput struct { Options *armnetwork.InterfacesClientBeginCreateOrUpdateOptions } +type NetworkInterfaceDeleteInput struct { + ResourceGroupName, InterfaceName string +} + type NetworkInterfacesBehavior struct { NetworkInterfacesCreateOrUpdateBehavior MockedLRO[NetworkInterfaceCreateOrUpdateInput, armnetwork.InterfacesClientCreateOrUpdateResponse] + NetworkInterfacesDeleteBehavior MockedLRO[NetworkInterfaceDeleteInput, armnetwork.InterfacesClientDeleteResponse] NetworkInterfaces sync.Map } @@ -79,7 +86,7 @@ func (c *NetworkInterfacesAPI) Get(_ context.Context, resourceGroupName string, id := mkNetworkInterfaceID(resourceGroupName, interfaceName) iface, ok := c.NetworkInterfaces.Load(id) if !ok { - return armnetwork.InterfacesClientGetResponse{}, fmt.Errorf("not found") + return armnetwork.InterfacesClientGetResponse{}, &azcore.ResponseError{ErrorCode: errors.ResourceNotFound} } return armnetwork.InterfacesClientGetResponse{ Interface: iface.(armnetwork.Interface), @@ -87,9 +94,15 @@ func (c *NetworkInterfacesAPI) Get(_ context.Context, resourceGroupName string, } func (c *NetworkInterfacesAPI) BeginDelete(_ context.Context, resourceGroupName string, interfaceName string, _ *armnetwork.InterfacesClientBeginDeleteOptions) (*runtime.Poller[armnetwork.InterfacesClientDeleteResponse], error) { - id := mkNetworkInterfaceID(resourceGroupName, interfaceName) - c.NetworkInterfaces.Delete(id) - return nil, nil + input := &NetworkInterfaceDeleteInput{ + ResourceGroupName: resourceGroupName, + InterfaceName: interfaceName, + } + return c.NetworkInterfacesDeleteBehavior.Invoke(input, func(input *NetworkInterfaceDeleteInput) (*armnetwork.InterfacesClientDeleteResponse, error) { + id := mkNetworkInterfaceID(resourceGroupName, interfaceName) + c.NetworkInterfaces.Delete(id) + return &armnetwork.InterfacesClientDeleteResponse{}, nil + }) } func mkNetworkInterfaceID(resourceGroupName, interfaceName string) string { diff --git a/pkg/providers/instance/azure_client.go b/pkg/providers/instance/azure_client.go index 08c7ee9ce..0fb7a6a0b 100644 --- a/pkg/providers/instance/azure_client.go +++ b/pkg/providers/instance/azure_client.go @@ -136,6 +136,7 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment) if err != nil { return nil, err } + interfacesClient, err := armnetwork.NewInterfacesClient(cfg.SubscriptionID, cred, opts) if err != nil { return nil, err diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index e0465c92d..aae8885e9 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -19,6 +19,7 @@ package instance import ( "context" "encoding/json" + "errors" "fmt" "math" "sort" @@ -123,6 +124,9 @@ func (p *Provider) Create(ctx context.Context, nodeClass *v1alpha2.AKSNodeClass, instanceTypes = orderInstanceTypesByPrice(instanceTypes, scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...)) vm, instanceType, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes) if err != nil { + if cleanupErr := p.cleanupAzureResources(ctx, GenerateResourceName(nodeClaim.Name)); cleanupErr != nil { + logging.FromContext(ctx).Errorf("failed to cleanup resources for node claim %s, %w", nodeClaim.Name, cleanupErr) + } return nil, err } zone, err := GetZoneID(vm) @@ -175,27 +179,19 @@ func (p *Provider) List(ctx context.Context) ([]*armcompute.VirtualMachine, erro return vmList, nil } -func (p *Provider) Delete(ctx context.Context, vmName string) error { - logging.FromContext(ctx).Debugf("Deleting virtual machine %s", vmName) - return deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName) +func (p *Provider) Delete(ctx context.Context, resourceName string) error { + logging.FromContext(ctx).Debugf("Deleting virtual machine %s and associated resources") + return p.cleanupAzureResources(ctx, resourceName) } // createAKSIdentifyingExtension attaches a VM extension to identify that this VM participates in an AKS cluster -func (p *Provider) createAKSIdentifyingExtension( - ctx context.Context, - vmName, _ string, -) error { - var err error +func (p *Provider) createAKSIdentifyingExtension(ctx context.Context, vmName string) (err error) { vmExt := p.getAKSIdentifyingExtension() vmExtName := *vmExt.Name logging.FromContext(ctx).Debugf("Creating virtual machine AKS identifying extension for %s", vmName) v, err := createVirtualMachineExtension(ctx, p.azClient.virtualMachinesExtensionClient, p.resourceGroup, vmName, vmExtName, *vmExt) if err != nil { logging.FromContext(ctx).Errorf("Creating VM AKS identifying extension for VM %q failed, %w", vmName, err) - vmErr := deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName) - if vmErr != nil { - logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr) - } return fmt.Errorf("creating VM AKS identifying extension for VM %q, %w failed", vmName, err) } logging.FromContext(ctx).Debugf("Created virtual machine AKS identifying extension for %s, with an id of %s", vmName, *v.ID) @@ -240,7 +236,7 @@ func (p *Provider) newNetworkInterfaceForVM(vmName string, backendPools *loadbal } } -func GenerateVMName(nodeClaimName string) string { +func GenerateResourceName(nodeClaimName string) string { return fmt.Sprintf("aks-%s", nodeClaimName) } @@ -255,9 +251,6 @@ func (p *Provider) createNetworkInterface(ctx context.Context, nicName string, l logging.FromContext(ctx).Debugf("Creating network interface %s", nicName) res, err := createNic(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName, nic) if err != nil { - if nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName); nicErr != nil { - logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", nicName, nicErr) - } return "", err } logging.FromContext(ctx).Debugf("Successfully created network interface: %v", *res.ID) @@ -374,14 +367,10 @@ func setVMTagsProvisionerName(tags map[string]*string, nodeClaim *corev1beta1.No } } -func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName, _ string) (*armcompute.VirtualMachine, error) { +func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName string) (*armcompute.VirtualMachine, error) { result, err := CreateVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName, vm) if err != nil { logging.FromContext(ctx).Errorf("Creating virtual machine %q failed: %v", vmName, err) - vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName) - if vmErr != nil { - logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr) - } return nil, fmt.Errorf("virtualMachine.BeginCreateOrUpdate for VM %q failed: %w", vmName, err) } logging.FromContext(ctx).Debugf("Created virtual machine %s", *result.ID) @@ -398,29 +387,28 @@ func (p *Provider) launchInstance( if err != nil { return nil, nil, fmt.Errorf("getting launch template: %w", err) } - - vmName := GenerateVMName(nodeClaim.Name) + // resourceName for the NIC, VM, and Disk + resourceName := GenerateResourceName(nodeClaim.Name) // create network interface - nicName := vmName - nicReference, err := p.createNetworkInterface(ctx, vmName, launchTemplate, instanceType) + nicReference, err := p.createNetworkInterface(ctx, resourceName, launchTemplate, instanceType) if err != nil { return nil, nil, err } sshPublicKey := settings.FromContext(ctx).SSHPublicKey nodeIdentityIDs := settings.FromContext(ctx).NodeIdentities - vm := newVMObject(vmName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType) + vm := newVMObject(resourceName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType) - logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", vmName, instanceType.Name) + logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", resourceName, instanceType.Name) // Uses AZ Client to create a new virtual machine using the vm object we prepared earlier - resp, err := p.createVirtualMachine(ctx, vm, vmName, nicName) + resp, err := p.createVirtualMachine(ctx, vm, resourceName) if err != nil { azErr := p.handleResponseErrors(ctx, instanceType, zone, capacityType, err) return nil, nil, azErr } - err = p.createAKSIdentifyingExtension(ctx, vmName, nicName) + err = p.createAKSIdentifyingExtension(ctx, resourceName) if err != nil { return nil, nil, err } @@ -561,6 +549,22 @@ func (p *Provider) pickSkuSizePriorityAndZone(ctx context.Context, nodeClaim *co return nil, "", "" } +func (p *Provider) cleanupAzureResources(ctx context.Context, resourceName string) (err error) { + vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, resourceName) + if vmErr != nil { + logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", resourceName, vmErr) + } + // The order here is intentional, if the VM was created successfully, then we attempt to delete the vm, the + // nic, disk and all associated resources will be removed. If the VM was not created successfully and a nic was found, + // then we attempt to delete the nic. + nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, resourceName) + if nicErr != nil { + logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", resourceName, nicErr) + } + + return errors.Join(vmErr, nicErr) +} + // getPriorityForInstanceType selects spot if both constraints are flexible and there is an available offering. // The Azure Cloud Provider defaults to Regular, so spot must be explicitly included in capacity type requirements. // diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 4a3739fec..2c559794a 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -145,7 +145,36 @@ var _ = Describe("InstanceType Provider", func() { ExpectCleanedUp(ctx, env.Client) }) - Context("subscription level quota error responses", func() { + Context("vm creation error responses", func() { + It("should delete the network interface on failure to create the vm", func() { + ErrMsg := "test error" + ErrCode := fmt.Sprint(http.StatusNotFound) + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: ErrCode, + RawResponse: &http.Response{ + Body: createSDKErrorBody(ErrCode, ErrMsg), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + // We should have created a nic for the vm + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + // The nic we used in the vm create, should be cleaned up if the vm call fails + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID) + Expect(ok).To(Equal(false)) + + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + }) It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) @@ -160,6 +189,15 @@ var _ = Describe("InstanceType Provider", func() { pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectNotScheduled(ctx, env.Client, pod) + + // We should have created a nic for the vm + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + // The nic we used in the vm create, should be cleaned up if the vm call fails + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID) + Expect(ok).To(Equal(false)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) pod = coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) @@ -179,6 +217,14 @@ var _ = Describe("InstanceType Provider", func() { pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectNotScheduled(ctx, env.Client, pod) + // We should have created a nic for the vm + Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + // The nic we used in the vm create, should be cleaned up if the vm call fails + nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop() + Expect(nic).NotTo(BeNil()) + _, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID) + Expect(ok).To(Equal(false)) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) pod = coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) diff --git a/pkg/utils/opts/armopts.go b/pkg/utils/opts/armopts.go index 68e5834d9..4c4655076 100644 --- a/pkg/utils/opts/armopts.go +++ b/pkg/utils/opts/armopts.go @@ -25,6 +25,8 @@ import ( "github.com/Azure/karpenter/pkg/auth" ) +var MaxRetries = 20 + func DefaultArmOpts() *arm.ClientOptions { opts := &arm.ClientOptions{} opts.Telemetry = DefaultTelemetryOpts() @@ -33,6 +35,11 @@ func DefaultArmOpts() *arm.ClientOptions { return opts } +func DefaultNICClientOpts() *arm.ClientOptions { + opts := DefaultArmOpts() + return opts +} + func DefaultRetryOpts() policy.RetryOptions { return policy.RetryOptions{ MaxRetries: 20, From 6edf1bbec8174bb6f9e5285e3c4c0a4bd5c2130c Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Wed, 10 Jan 2024 16:22:40 -0800 Subject: [PATCH 3/3] chore: removing unused var (#88) --- pkg/utils/opts/armopts.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/utils/opts/armopts.go b/pkg/utils/opts/armopts.go index 4c4655076..881c88045 100644 --- a/pkg/utils/opts/armopts.go +++ b/pkg/utils/opts/armopts.go @@ -25,8 +25,6 @@ import ( "github.com/Azure/karpenter/pkg/auth" ) -var MaxRetries = 20 - func DefaultArmOpts() *arm.ClientOptions { opts := &arm.ClientOptions{} opts.Telemetry = DefaultTelemetryOpts()