From 454c0add4838197e9bbb416460fa69423dd88c3f Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Fri, 10 Jan 2025 21:58:17 +0000 Subject: [PATCH] fix: bootstrap.sh --- pkg/controllers/controllers.go | 7 +- .../instance_garbagecollection.go | 15 ++-- .../nic_garbagecollection.go | 81 ++++++++++--------- .../nodeclaim/garbagecollection/suite_test.go | 1 - pkg/fake/azureresourcegraphapi.go | 11 +-- pkg/providers/instance/arglist.go | 16 ++++ pkg/providers/instance/instance.go | 9 +-- pkg/providers/instance/suite_test.go | 2 +- pkg/test/environment.go | 6 +- 9 files changed, 83 insertions(+), 65 deletions(-) diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index cb0c006d0..8e3df9e41 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -42,17 +42,16 @@ import ( func NewControllers(ctx context.Context, mgr manager.Manager, kubeClient client.Client, recorder events.Recorder, cloudProvider cloudprovider.CloudProvider, instanceProvider instance.Provider) []controller.Controller { - - unremovableNics := cache.New(nodeclaimgarbagecollection.NicReservationDuration, time.Second * 30) + unremovableNics := cache.New(nodeclaimgarbagecollection.NicReservationDuration, time.Second*30) controllers := []controller.Controller{ nodeclasshash.NewController(kubeClient), nodeclassstatus.NewController(kubeClient), nodeclasstermination.NewController(kubeClient, recorder), - + // resources the instance provider creates are garbage collected by these controllers nodeclaimgarbagecollection.NewVirtualMachineController(kubeClient, cloudProvider, unremovableNics), - nodeclaimgarbagecollection.NewNetworkInterfaceController(kubeClient, instanceProvider, unremovableNics), + nodeclaimgarbagecollection.NewNetworkInterfaceController(kubeClient, instanceProvider, unremovableNics), // TODO: nodeclaim tagging inplaceupdate.NewController(kubeClient, instanceProvider), diff --git a/pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go b/pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go index a38f2f8a2..faaf9d241 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go +++ b/pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go @@ -42,16 +42,15 @@ import ( corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" ) - -const ( - NicBelongsToVM = "NicBelongsToVM" +const ( + NicBelongsToVM = "NicBelongsToVM" ) type VirtualMachineController struct { kubeClient client.Client cloudProvider corecloudprovider.CloudProvider - unremovableNics *cache.Cache - successfulCount uint64 // keeps track of successful reconciles for more aggressive requeueing near the start of the controller + unremovableNics *cache.Cache + successfulCount uint64 // keeps track of successful reconciles for more aggressive requeuing near the start of the controller } func NewVirtualMachineController(kubeClient client.Client, cloudProvider corecloudprovider.CloudProvider, unremovableNics *cache.Cache) *VirtualMachineController { @@ -74,11 +73,11 @@ func (c *VirtualMachineController) Reconcile(ctx context.Context) (reconcile.Res return reconcile.Result{}, fmt.Errorf("listing cloudprovider VMs, %w", err) } - // Mark all vms on the cloudprovider as unremovableNics for the nicGc controller + // Mark all vms on the cloudprovider as unremovableNics for the nicGc controller for _, nodeClaim := range retrieved { - // Nics Belonging to a vm cannot be removed while attached to the vm. + // Nics Belonging to a vm cannot be removed while attached to the vm. // lets set a nic as unremovable for 15 minutes if it belongs to a vm - c.unremovableNics.Set(instance.GenerateResourceName(nodeClaim.Name), "", time.Minute * 15) + c.unremovableNics.Set(instance.GenerateResourceName(nodeClaim.Name), "", time.Minute*15) } managedRetrieved := lo.Filter(retrieved, func(nc *karpv1.NodeClaim, _ int) bool { diff --git a/pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go b/pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go index 45502e1cc..a77abc759 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go +++ b/pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go @@ -1,16 +1,30 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package garbagecollection import ( - "time" "context" "fmt" + "time" - - "github.com/samber/lo" "github.com/patrickmn/go-cache" + "github.com/samber/lo" "knative.dev/pkg/logging" - karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" "github.com/awslabs/operatorpkg/singleton" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/workqueue" @@ -18,80 +32,75 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/operator/injection" - - "github.com/Azure/karpenter-provider-azure/pkg/providers/instance" sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" + "github.com/Azure/karpenter-provider-azure/pkg/providers/instance" ) - const ( - NICGCControllerName = "networkinterface.garbagecollection" + NICGCControllerName = "networkinterface.garbagecollection" NicReservationDuration = time.Second * 180 ) - - type NetworkInterfaceController struct { - kubeClient client.Client + kubeClient client.Client instanceProvider instance.Provider - // A networkInterface is considered unremovable if it meets the following 3 criteria + // A networkInterface is considered unremovable if it meets the following 3 criteria // 1: Reserved by NRP: When creating a nic and attempting to assign it to a vm, the nic will be reserved for that vm arm_resource_id for 180 seconds - // 2: Belongs to a Nodeclaim: If a nodeclaim exists in the cluster we shouldn't attempt removing it + // 2: Belongs to a Nodeclaim: If a nodeclaim exists in the cluster we shouldn't attempt removing it // 3: Belongs to VM: If the VM Garbage Collection controller is removing a vm, we should not attempt removing it in this controller, and delegate that responsibility to the vm gc controller since deleting a successfully provisioned vm has delete options to also clean up the associated nic - unremovableNics *cache.Cache - + unremovableNics *cache.Cache } -func NewNetworkInterfaceController(kubeClient client.Client, instanceProvider instance.Provider, unremovableNics *cache.Cache) *NetworkInterfaceController { +func NewNetworkInterfaceController(kubeClient client.Client, instanceProvider instance.Provider, unremovableNics *cache.Cache) *NetworkInterfaceController { return &NetworkInterfaceController{ - kubeClient: kubeClient, + kubeClient: kubeClient, instanceProvider: instanceProvider, - unremovableNics: unremovableNics, + unremovableNics: unremovableNics, } } func (c *NetworkInterfaceController) Reconcile(ctx context.Context) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, NICGCControllerName) - + nodeClaimList := &karpv1.NodeClaimList{} if err := c.kubeClient.List(ctx, nodeClaimList); err != nil { return reconcile.Result{}, fmt.Errorf("listing NodeClaims for NIC GC: %w", err) } - - + // List all NICs from the instance provider, this List call will give us network interfaces that belong to karpenter nics, err := c.instanceProvider.ListNics(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("listing NICs: %w", err) } - + // resourceNames is the resource representation for each nodeclaim - resourceNames := sets.New[string]() + resourceNames := sets.New[string]() for _, nodeClaim := range nodeClaimList.Items { // Adjust the prefix as per the aks naming convention resourceNames.Insert(fmt.Sprintf("aks-%s", nodeClaim.Name)) } errs := make([]error, len(nics)) - workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int){ + workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int) { nicName := lo.FromPtr(nics[i].Name) _, removableNic := c.unremovableNics.Get(nicName) noNodeclaimExistsForNIC := !resourceNames.Has(nicName) - // The networkInterface is unremovable if its - // A: Reserved by NRP + // The networkInterface is unremovable if its + // A: Reserved by NRP // B: Belongs to a Nodeclaim // C: Belongs to VM if noNodeclaimExistsForNIC && removableNic { - err := c.instanceProvider.DeleteNic(ctx, nicName) - if sdkerrors.IsNicReservedForAnotherVM(err) { + err := c.instanceProvider.DeleteNic(ctx, nicName) + if sdkerrors.IsNicReservedForAnotherVM(err) { c.unremovableNics.Set(nicName, sdkerrors.NicReservedForAnotherVM, NicReservationDuration) - return + return } - if err != nil { - errs[i] = err - return + if err != nil { + errs[i] = err + return } logging.FromContext(ctx).With("nic", nicName).Infof("garbage collected NIC") @@ -100,14 +109,14 @@ func (c *NetworkInterfaceController) Reconcile(ctx context.Context) (reconcile.R // requeue every 5 minutes, adjust for throttling? return reconcile.Result{ - Requeue: true, + Requeue: true, RequeueAfter: time.Minute * 5, }, nil } func (c *NetworkInterfaceController) Register(_ context.Context, m manager.Manager) error { - return controllerruntime.NewControllerManagedBy(m). - Named(NICGCControllerName). - WatchesRawSource(singleton.Source()). + return controllerruntime.NewControllerManagedBy(m). + Named(NICGCControllerName). + WatchesRawSource(singleton.Source()). Complete(singleton.AsReconciler(c)) } diff --git a/pkg/controllers/nodeclaim/garbagecollection/suite_test.go b/pkg/controllers/nodeclaim/garbagecollection/suite_test.go index 85d6e27e9..6385f7d3d 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/suite_test.go +++ b/pkg/controllers/nodeclaim/garbagecollection/suite_test.go @@ -333,4 +333,3 @@ var _ = Describe("VirtualMachine Garbage Collection", func() { }) }) }) - diff --git a/pkg/fake/azureresourcegraphapi.go b/pkg/fake/azureresourcegraphapi.go index e82ee7859..39e7c29bc 100644 --- a/pkg/fake/azureresourcegraphapi.go +++ b/pkg/fake/azureresourcegraphapi.go @@ -36,7 +36,7 @@ type AzureResourceGraphResourcesInput struct { type AzureResourceGraphBehavior struct { AzureResourceGraphResourcesBehavior MockedFunction[AzureResourceGraphResourcesInput, armresourcegraph.ClientResourcesResponse] VirtualMachinesAPI *VirtualMachinesAPI - NetworkInterfacesAPI *NetworkInterfacesAPI + NetworkInterfacesAPI *NetworkInterfacesAPI ResourceGroup string } @@ -82,16 +82,14 @@ func (c *AzureResourceGraphAPI) getResourceList(query string) []interface{} { return nic.Tags != nil && nic.Tags[instance.NodePoolTagKey] != nil }) resourceList := lo.Map(nicList, func(nic armnetwork.Interface, _ int) interface{} { - b, _ := json.Marshal(nic) - return convertBytesToInterface(b) + b, _ := json.Marshal(nic) + return convertBytesToInterface(b) }) return resourceList } return nil } - - func (c *AzureResourceGraphAPI) loadVMObjects() (vmList []armcompute.VirtualMachine) { c.VirtualMachinesAPI.Instances.Range(func(k, v any) bool { vm, _ := c.VirtualMachinesAPI.Instances.Load(k) @@ -101,11 +99,10 @@ func (c *AzureResourceGraphAPI) loadVMObjects() (vmList []armcompute.VirtualMach return vmList } - func (c *AzureResourceGraphAPI) loadNicObjects() (nicList []armnetwork.Interface) { c.NetworkInterfacesAPI.NetworkInterfaces.Range(func(k, v any) bool { nic, _ := c.NetworkInterfacesAPI.NetworkInterfaces.Load(k) - nicList = append(nicList, nic.(armnetwork.Interface)) + nicList = append(nicList, nic.(armnetwork.Interface)) return true }) return nicList diff --git a/pkg/providers/instance/arglist.go b/pkg/providers/instance/arglist.go index c0d2366fe..fc41fd5f0 100644 --- a/pkg/providers/instance/arglist.go +++ b/pkg/providers/instance/arglist.go @@ -1,3 +1,19 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package instance import ( diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 1046bc557..cd58f51c3 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -227,11 +227,10 @@ func (p *DefaultProvider) ListNics(ctx context.Context) ([]*armnetwork.Interface return nicList, nil } -func(p *DefaultProvider) DeleteNic(ctx context.Context, nicName string) error { +func (p *DefaultProvider) DeleteNic(ctx context.Context, nicName string) error { return deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName) } - // createAKSIdentifyingExtension attaches a VM extension to identify that this VM participates in an AKS cluster func (p *DefaultProvider) createAKSIdentifyingExtension(ctx context.Context, vmName string) (err error) { vmExt := p.getAKSIdentifyingExtension() @@ -674,10 +673,10 @@ func (p *DefaultProvider) cleanupAzureResources(ctx context.Context, resourceNam // The order here is intentional, if the VM was created successfully, then we attempt to delete the vm, the // nic, disk and all associated resources will be removed. If the VM was not created successfully and a nic was found, // then we attempt to delete the nic. - - nicErr := p.DeleteNic(ctx, resourceName) + + nicErr := p.DeleteNic(ctx, resourceName) if nicErr != nil { - logging.FromContext(ctx).Errorf("networkinterface.Delete for %s failed: %v", resourceName, nicErr) + logging.FromContext(ctx).Errorf("networkinterface.Delete for %s failed: %v", resourceName, nicErr) } return errors.Join(vmErr, nicErr) } diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index 48f95adc6..f79141428 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -215,7 +215,7 @@ var _ = Describe("InstanceProvider", func() { return strings.Contains(key, "/") // ARM tags can't contain '/' })).To(HaveLen(0)) }) - It("should list nic from karpenter provisioning request", func(){ + It("should list nic from karpenter provisioning request", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) diff --git a/pkg/test/environment.go b/pkg/test/environment.go index ad610a7f6..b08eeb74f 100644 --- a/pkg/test/environment.go +++ b/pkg/test/environment.go @@ -92,7 +92,7 @@ func NewRegionalEnvironment(ctx context.Context, env *coretest.Environment, regi // API virtualMachinesAPI := &fake.VirtualMachinesAPI{} - + networkInterfacesAPI := &fake.NetworkInterfacesAPI{} virtualMachinesExtensionsAPI := &fake.VirtualMachineExtensionsAPI{} pricingAPI := &fake.PricingAPI{} @@ -103,9 +103,9 @@ func NewRegionalEnvironment(ctx context.Context, env *coretest.Environment, regi azureResourceGraphAPI := &fake.AzureResourceGraphAPI{ AzureResourceGraphBehavior: fake.AzureResourceGraphBehavior{ - VirtualMachinesAPI: virtualMachinesAPI, + VirtualMachinesAPI: virtualMachinesAPI, NetworkInterfacesAPI: networkInterfacesAPI, - ResourceGroup: resourceGroup, + ResourceGroup: resourceGroup, }} // Cache kubernetesVersionCache := cache.New(azurecache.KubernetesVersionTTL, azurecache.DefaultCleanupInterval)