Skip to content

Commit

Permalink
Merge branch 'main' into bsoghigian/starter-readme
Browse files Browse the repository at this point in the history
  • Loading branch information
Bryce-Soghigian authored Jan 11, 2024
2 parents dd2074d + 6edf1bb commit 793bbf3
Show file tree
Hide file tree
Showing 12 changed files with 133 additions and 64 deletions.
2 changes: 1 addition & 1 deletion pkg/fake/azureresourcegraphapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ type AzureResourceGraphBehavior struct {
}

// assert that the fake implements the interface
var _ instance.AzureResourceGraphAPI = (*AzureResourceGraphAPI)(nil)
var _ instance.AzureResourceGraphAPI = &AzureResourceGraphAPI{}

type AzureResourceGraphAPI struct {
AzureResourceGraphBehavior
Expand Down
2 changes: 1 addition & 1 deletion pkg/fake/communityimageversionsapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ type CommunityGalleryImageVersionsAPI struct {
}

// assert that the fake implements the interface
var _ imagefamily.CommunityGalleryImageVersionsAPI = (*CommunityGalleryImageVersionsAPI)(nil)
var _ imagefamily.CommunityGalleryImageVersionsAPI = &CommunityGalleryImageVersionsAPI{}

// NewListPager returns a new pager to return the next page of CommunityGalleryImageVersionsClientListResponse
func (c *CommunityGalleryImageVersionsAPI) NewListPager(_ string, _ string, _ string, _ *armcompute.CommunityGalleryImageVersionsClientListOptions) *runtime.Pager[armcompute.CommunityGalleryImageVersionsClientListResponse] {
Expand Down
23 changes: 18 additions & 5 deletions pkg/fake/networkinterfaceapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"fmt"
"sync"

"github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork"
"github.com/Azure/go-autorest/autorest/to"
Expand All @@ -34,13 +36,18 @@ type NetworkInterfaceCreateOrUpdateInput struct {
Options *armnetwork.InterfacesClientBeginCreateOrUpdateOptions
}

type NetworkInterfaceDeleteInput struct {
ResourceGroupName, InterfaceName string
}

type NetworkInterfacesBehavior struct {
NetworkInterfacesCreateOrUpdateBehavior MockedLRO[NetworkInterfaceCreateOrUpdateInput, armnetwork.InterfacesClientCreateOrUpdateResponse]
NetworkInterfacesDeleteBehavior MockedLRO[NetworkInterfaceDeleteInput, armnetwork.InterfacesClientDeleteResponse]
NetworkInterfaces sync.Map
}

// assert that the fake implements the interface
var _ instance.NetworkInterfacesAPI = (*NetworkInterfacesAPI)(nil)
var _ instance.NetworkInterfacesAPI = &NetworkInterfacesAPI{}

type NetworkInterfacesAPI struct {
// instance.NetworkInterfacesAPI
Expand Down Expand Up @@ -79,17 +86,23 @@ func (c *NetworkInterfacesAPI) Get(_ context.Context, resourceGroupName string,
id := mkNetworkInterfaceID(resourceGroupName, interfaceName)
iface, ok := c.NetworkInterfaces.Load(id)
if !ok {
return armnetwork.InterfacesClientGetResponse{}, fmt.Errorf("not found")
return armnetwork.InterfacesClientGetResponse{}, &azcore.ResponseError{ErrorCode: errors.ResourceNotFound}
}
return armnetwork.InterfacesClientGetResponse{
Interface: iface.(armnetwork.Interface),
}, nil
}

func (c *NetworkInterfacesAPI) BeginDelete(_ context.Context, resourceGroupName string, interfaceName string, _ *armnetwork.InterfacesClientBeginDeleteOptions) (*runtime.Poller[armnetwork.InterfacesClientDeleteResponse], error) {
id := mkNetworkInterfaceID(resourceGroupName, interfaceName)
c.NetworkInterfaces.Delete(id)
return nil, nil
input := &NetworkInterfaceDeleteInput{
ResourceGroupName: resourceGroupName,
InterfaceName: interfaceName,
}
return c.NetworkInterfacesDeleteBehavior.Invoke(input, func(input *NetworkInterfaceDeleteInput) (*armnetwork.InterfacesClientDeleteResponse, error) {
id := mkNetworkInterfaceID(resourceGroupName, interfaceName)
c.NetworkInterfaces.Delete(id)
return &armnetwork.InterfacesClientDeleteResponse{}, nil
})
}

func mkNetworkInterfaceID(resourceGroupName, interfaceName string) string {
Expand Down
3 changes: 3 additions & 0 deletions pkg/fake/pricingapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ type PricingBehavior struct {
ProductsPricePage AtomicPtr[client.ProductsPricePage]
}

// assert that the fake implements the interface
var _ client.PricingAPI = &PricingAPI{}

func (p *PricingAPI) Reset() {
p.NextError.Reset()
p.ProductsPricePage.Reset()
Expand Down
2 changes: 1 addition & 1 deletion pkg/fake/virtualmachineextensionsapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ type VirtualMachineExtensionsBehavior struct {
}

// assert that ComputeAPI implements ARMComputeAPI
var _ instance.VirtualMachineExtensionsAPI = (*VirtualMachineExtensionsAPI)(nil)
var _ instance.VirtualMachineExtensionsAPI = &VirtualMachineExtensionsAPI{}

type VirtualMachineExtensionsAPI struct {
// instance.VirtualMachineExtensionsAPI
Expand Down
2 changes: 1 addition & 1 deletion pkg/fake/virtualmachinesapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ type VirtualMachinesBehavior struct {
}

// assert that the fake implements the interface
var _ instance.VirtualMachinesAPI = (*VirtualMachinesAPI)(nil)
var _ instance.VirtualMachinesAPI = &VirtualMachinesAPI{}

type VirtualMachinesAPI struct {
// TODO: document the implications of embedding vs. not embedding the interface here
Expand Down
6 changes: 3 additions & 3 deletions pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ type Operator struct {
}

func NewOperator(ctx context.Context, operator *operator.Operator) (context.Context, *Operator) {
azConfig, err := GetAzConfig()
azConfig, err := GetAZConfig()
lo.Must0(err, "creating Azure config") // TODO: I assume we prefer this over the cleaner azConfig := lo.Must(GetAzConfig()), as this has a helpful error message?

azClient, err := instance.CreateAzClient(ctx, azConfig)
azClient, err := instance.CreateAZClient(ctx, azConfig)
lo.Must0(err, "creating Azure client")

unavailableOfferingsCache := azurecache.NewUnavailableOfferings()
Expand Down Expand Up @@ -125,7 +125,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
}
}

func GetAzConfig() (*auth.Config, error) {
func GetAZConfig() (*auth.Config, error) {
cfg, err := auth.BuildAzureConfig()
if err != nil {
return nil, err
Expand Down
22 changes: 10 additions & 12 deletions pkg/providers/instance/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func NewAZClientFromAPI(
}
}

func CreateAzClient(ctx context.Context, cfg *auth.Config) (*AZClient, error) {
func CreateAZClient(ctx context.Context, cfg *auth.Config) (*AZClient, error) {
// Defaulting env to Azure Public Cloud.
env := azure.PublicCloud
var err error
Expand Down Expand Up @@ -132,10 +132,11 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment)
}

opts := armopts.DefaultArmOpts()
extClient, err := armcompute.NewVirtualMachineExtensionsClient(cfg.SubscriptionID, cred, opts)
extensionsClient, err := armcompute.NewVirtualMachineExtensionsClient(cfg.SubscriptionID, cred, opts)
if err != nil {
return nil, err
}

interfacesClient, err := armnetwork.NewInterfacesClient(cfg.SubscriptionID, cred, opts)
if err != nil {
return nil, err
Expand Down Expand Up @@ -177,14 +178,11 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *azure.Environment)
// TODO Move this over to track 2 when skewer is migrated
skuClient := skuclient.NewSkuClient(ctx, cfg, env)

return &AZClient{
networkInterfacesClient: interfacesClient,
virtualMachinesClient: virtualMachinesClient,
virtualMachinesExtensionClient: extClient,
azureResourceGraphClient: azureResourceGraphClient,

ImageVersionsClient: imageVersionsClient,
SKUClient: skuClient,
LoadBalancersClient: loadBalancersClient,
}, nil
return NewAZClientFromAPI(virtualMachinesClient,
azureResourceGraphClient,
extensionsClient,
interfacesClient,
loadBalancersClient,
imageVersionsClient,
skuClient), nil
}
62 changes: 33 additions & 29 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package instance
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"sort"
Expand Down Expand Up @@ -123,6 +124,9 @@ func (p *Provider) Create(ctx context.Context, nodeClass *v1alpha2.AKSNodeClass,
instanceTypes = orderInstanceTypesByPrice(instanceTypes, scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...))
vm, instanceType, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes)
if err != nil {
if cleanupErr := p.cleanupAzureResources(ctx, GenerateResourceName(nodeClaim.Name)); cleanupErr != nil {
logging.FromContext(ctx).Errorf("failed to cleanup resources for node claim %s, %w", nodeClaim.Name, cleanupErr)
}
return nil, err
}
zone, err := GetZoneID(vm)
Expand Down Expand Up @@ -175,27 +179,19 @@ func (p *Provider) List(ctx context.Context) ([]*armcompute.VirtualMachine, erro
return vmList, nil
}

func (p *Provider) Delete(ctx context.Context, vmName string) error {
logging.FromContext(ctx).Debugf("Deleting virtual machine %s", vmName)
return deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName)
func (p *Provider) Delete(ctx context.Context, resourceName string) error {
logging.FromContext(ctx).Debugf("Deleting virtual machine %s and associated resources")
return p.cleanupAzureResources(ctx, resourceName)
}

// createAKSIdentifyingExtension attaches a VM extension to identify that this VM participates in an AKS cluster
func (p *Provider) createAKSIdentifyingExtension(
ctx context.Context,
vmName, _ string,
) error {
var err error
func (p *Provider) createAKSIdentifyingExtension(ctx context.Context, vmName string) (err error) {
vmExt := p.getAKSIdentifyingExtension()
vmExtName := *vmExt.Name
logging.FromContext(ctx).Debugf("Creating virtual machine AKS identifying extension for %s", vmName)
v, err := createVirtualMachineExtension(ctx, p.azClient.virtualMachinesExtensionClient, p.resourceGroup, vmName, vmExtName, *vmExt)
if err != nil {
logging.FromContext(ctx).Errorf("Creating VM AKS identifying extension for VM %q failed, %w", vmName, err)
vmErr := deleteVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName)
if vmErr != nil {
logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr)
}
return fmt.Errorf("creating VM AKS identifying extension for VM %q, %w failed", vmName, err)
}
logging.FromContext(ctx).Debugf("Created virtual machine AKS identifying extension for %s, with an id of %s", vmName, *v.ID)
Expand Down Expand Up @@ -240,7 +236,7 @@ func (p *Provider) newNetworkInterfaceForVM(vmName string, backendPools *loadbal
}
}

func GenerateVMName(nodeClaimName string) string {
func GenerateResourceName(nodeClaimName string) string {
return fmt.Sprintf("aks-%s", nodeClaimName)
}

Expand All @@ -255,9 +251,6 @@ func (p *Provider) createNetworkInterface(ctx context.Context, nicName string, l
logging.FromContext(ctx).Debugf("Creating network interface %s", nicName)
res, err := createNic(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName, nic)
if err != nil {
if nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName); nicErr != nil {
logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", nicName, nicErr)
}
return "", err
}
logging.FromContext(ctx).Debugf("Successfully created network interface: %v", *res.ID)
Expand Down Expand Up @@ -374,14 +367,10 @@ func setVMTagsProvisionerName(tags map[string]*string, nodeClaim *corev1beta1.No
}
}

func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName, _ string) (*armcompute.VirtualMachine, error) {
func (p *Provider) createVirtualMachine(ctx context.Context, vm armcompute.VirtualMachine, vmName string) (*armcompute.VirtualMachine, error) {
result, err := CreateVirtualMachine(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName, vm)
if err != nil {
logging.FromContext(ctx).Errorf("Creating virtual machine %q failed: %v", vmName, err)
vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, vmName)
if vmErr != nil {
logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", vmName, vmErr)
}
return nil, fmt.Errorf("virtualMachine.BeginCreateOrUpdate for VM %q failed: %w", vmName, err)
}
logging.FromContext(ctx).Debugf("Created virtual machine %s", *result.ID)
Expand All @@ -398,29 +387,28 @@ func (p *Provider) launchInstance(
if err != nil {
return nil, nil, fmt.Errorf("getting launch template: %w", err)
}

vmName := GenerateVMName(nodeClaim.Name)
// resourceName for the NIC, VM, and Disk
resourceName := GenerateResourceName(nodeClaim.Name)

// create network interface
nicName := vmName
nicReference, err := p.createNetworkInterface(ctx, vmName, launchTemplate, instanceType)
nicReference, err := p.createNetworkInterface(ctx, resourceName, launchTemplate, instanceType)
if err != nil {
return nil, nil, err
}

sshPublicKey := settings.FromContext(ctx).SSHPublicKey
nodeIdentityIDs := settings.FromContext(ctx).NodeIdentities
vm := newVMObject(vmName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType)
vm := newVMObject(resourceName, nicReference, zone, capacityType, p.location, sshPublicKey, nodeIdentityIDs, nodeClass, nodeClaim, launchTemplate, instanceType)

logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", vmName, instanceType.Name)
logging.FromContext(ctx).Debugf("Creating virtual machine %s (%s)", resourceName, instanceType.Name)
// Uses AZ Client to create a new virtual machine using the vm object we prepared earlier
resp, err := p.createVirtualMachine(ctx, vm, vmName, nicName)
resp, err := p.createVirtualMachine(ctx, vm, resourceName)
if err != nil {
azErr := p.handleResponseErrors(ctx, instanceType, zone, capacityType, err)
return nil, nil, azErr
}

err = p.createAKSIdentifyingExtension(ctx, vmName, nicName)
err = p.createAKSIdentifyingExtension(ctx, resourceName)
if err != nil {
return nil, nil, err
}
Expand Down Expand Up @@ -561,6 +549,22 @@ func (p *Provider) pickSkuSizePriorityAndZone(ctx context.Context, nodeClaim *co
return nil, "", ""
}

func (p *Provider) cleanupAzureResources(ctx context.Context, resourceName string) (err error) {
vmErr := deleteVirtualMachineIfExists(ctx, p.azClient.virtualMachinesClient, p.resourceGroup, resourceName)
if vmErr != nil {
logging.FromContext(ctx).Errorf("virtualMachine.Delete for %s failed: %v", resourceName, vmErr)
}
// The order here is intentional, if the VM was created successfully, then we attempt to delete the vm, the
// nic, disk and all associated resources will be removed. If the VM was not created successfully and a nic was found,
// then we attempt to delete the nic.
nicErr := deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, resourceName)
if nicErr != nil {
logging.FromContext(ctx).Errorf("networkInterface.Delete for %s failed: %v", resourceName, nicErr)
}

return errors.Join(vmErr, nicErr)
}

// getPriorityForInstanceType selects spot if both constraints are flexible and there is an available offering.
// The Azure Cloud Provider defaults to Regular, so spot must be explicitly included in capacity type requirements.
//
Expand Down
48 changes: 47 additions & 1 deletion pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,36 @@ var _ = Describe("InstanceType Provider", func() {
ExpectCleanedUp(ctx, env.Client)
})

Context("subscription level quota error responses", func() {
Context("vm creation error responses", func() {
It("should delete the network interface on failure to create the vm", func() {
ErrMsg := "test error"
ErrCode := fmt.Sprint(http.StatusNotFound)
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: ErrCode,
RawResponse: &http.Response{
Body: createSDKErrorBody(ErrCode, ErrMsg),
},
},
)
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)

// We should have created a nic for the vm
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
// The nic we used in the vm create, should be cleaned up if the vm call fails
nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop()
Expect(nic).NotTo(BeNil())
_, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID)
Expect(ok).To(Equal(false))

azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
})
It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() {
familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests"
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
Expand All @@ -160,6 +189,15 @@ var _ = Describe("InstanceType Provider", func() {
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)

// We should have created a nic for the vm
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
// The nic we used in the vm create, should be cleaned up if the vm call fails
nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop()
Expect(nic).NotTo(BeNil())
_, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID)
Expect(ok).To(Equal(false))

azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
Expand All @@ -179,6 +217,14 @@ var _ = Describe("InstanceType Provider", func() {
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)
// We should have created a nic for the vm
Expect(azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
// The nic we used in the vm create, should be cleaned up if the vm call fails
nic := azureEnv.NetworkInterfacesAPI.NetworkInterfacesCreateOrUpdateBehavior.CalledWithInput.Pop()
Expect(nic).NotTo(BeNil())
_, ok := azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Load(nic.Interface.ID)
Expect(ok).To(Equal(false))

azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
Expand Down
5 changes: 5 additions & 0 deletions pkg/utils/opts/armopts.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ func DefaultArmOpts() *arm.ClientOptions {
return opts
}

func DefaultNICClientOpts() *arm.ClientOptions {
opts := DefaultArmOpts()
return opts
}

func DefaultRetryOpts() policy.RetryOptions {
return policy.RetryOptions{
MaxRetries: 20,
Expand Down
Loading

0 comments on commit 793bbf3

Please sign in to comment.