Skip to content

Commit

Permalink
fix: bootstrap.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
Bryce-Soghigian committed Jan 10, 2025
1 parent 17c6dc0 commit 454c0ad
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 65 deletions.
7 changes: 3 additions & 4 deletions pkg/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,16 @@ import (

func NewControllers(ctx context.Context, mgr manager.Manager, kubeClient client.Client, recorder events.Recorder,
cloudProvider cloudprovider.CloudProvider, instanceProvider instance.Provider) []controller.Controller {

unremovableNics := cache.New(nodeclaimgarbagecollection.NicReservationDuration, time.Second * 30)
unremovableNics := cache.New(nodeclaimgarbagecollection.NicReservationDuration, time.Second*30)

controllers := []controller.Controller{
nodeclasshash.NewController(kubeClient),
nodeclassstatus.NewController(kubeClient),
nodeclasstermination.NewController(kubeClient, recorder),

// resources the instance provider creates are garbage collected by these controllers
nodeclaimgarbagecollection.NewVirtualMachineController(kubeClient, cloudProvider, unremovableNics),
nodeclaimgarbagecollection.NewNetworkInterfaceController(kubeClient, instanceProvider, unremovableNics),
nodeclaimgarbagecollection.NewNetworkInterfaceController(kubeClient, instanceProvider, unremovableNics),

// TODO: nodeclaim tagging
inplaceupdate.NewController(kubeClient, instanceProvider),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,15 @@ import (
corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider"
)


const (
NicBelongsToVM = "NicBelongsToVM"
const (
NicBelongsToVM = "NicBelongsToVM"
)

type VirtualMachineController struct {
kubeClient client.Client
cloudProvider corecloudprovider.CloudProvider
unremovableNics *cache.Cache
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeueing near the start of the controller
unremovableNics *cache.Cache
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeuing near the start of the controller
}

func NewVirtualMachineController(kubeClient client.Client, cloudProvider corecloudprovider.CloudProvider, unremovableNics *cache.Cache) *VirtualMachineController {
Expand All @@ -74,11 +73,11 @@ func (c *VirtualMachineController) Reconcile(ctx context.Context) (reconcile.Res
return reconcile.Result{}, fmt.Errorf("listing cloudprovider VMs, %w", err)
}

// Mark all vms on the cloudprovider as unremovableNics for the nicGc controller
// Mark all vms on the cloudprovider as unremovableNics for the nicGc controller
for _, nodeClaim := range retrieved {
// Nics Belonging to a vm cannot be removed while attached to the vm.
// Nics Belonging to a vm cannot be removed while attached to the vm.
// lets set a nic as unremovable for 15 minutes if it belongs to a vm
c.unremovableNics.Set(instance.GenerateResourceName(nodeClaim.Name), "", time.Minute * 15)
c.unremovableNics.Set(instance.GenerateResourceName(nodeClaim.Name), "", time.Minute*15)
}

managedRetrieved := lo.Filter(retrieved, func(nc *karpv1.NodeClaim, _ int) bool {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,97 +1,106 @@
/*
Portions Copyright (c) Microsoft Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package garbagecollection

import (
"time"
"context"
"fmt"
"time"


"github.com/samber/lo"
"github.com/patrickmn/go-cache"
"github.com/samber/lo"
"knative.dev/pkg/logging"

karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"github.com/awslabs/operatorpkg/singleton"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/util/workqueue"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/operator/injection"


"github.com/Azure/karpenter-provider-azure/pkg/providers/instance"
sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
"github.com/Azure/karpenter-provider-azure/pkg/providers/instance"
)


const (
NICGCControllerName = "networkinterface.garbagecollection"
NICGCControllerName = "networkinterface.garbagecollection"
NicReservationDuration = time.Second * 180
)



type NetworkInterfaceController struct {
kubeClient client.Client
kubeClient client.Client
instanceProvider instance.Provider
// A networkInterface is considered unremovable if it meets the following 3 criteria
// A networkInterface is considered unremovable if it meets the following 3 criteria
// 1: Reserved by NRP: When creating a nic and attempting to assign it to a vm, the nic will be reserved for that vm arm_resource_id for 180 seconds
// 2: Belongs to a Nodeclaim: If a nodeclaim exists in the cluster we shouldn't attempt removing it
// 2: Belongs to a Nodeclaim: If a nodeclaim exists in the cluster we shouldn't attempt removing it
// 3: Belongs to VM: If the VM Garbage Collection controller is removing a vm, we should not attempt removing it in this controller, and delegate that responsibility to the vm gc controller since deleting a successfully provisioned vm has delete options to also clean up the associated nic
unremovableNics *cache.Cache

unremovableNics *cache.Cache
}

func NewNetworkInterfaceController(kubeClient client.Client, instanceProvider instance.Provider, unremovableNics *cache.Cache) *NetworkInterfaceController {
func NewNetworkInterfaceController(kubeClient client.Client, instanceProvider instance.Provider, unremovableNics *cache.Cache) *NetworkInterfaceController {
return &NetworkInterfaceController{
kubeClient: kubeClient,
kubeClient: kubeClient,
instanceProvider: instanceProvider,
unremovableNics: unremovableNics,
unremovableNics: unremovableNics,
}
}

func (c *NetworkInterfaceController) Reconcile(ctx context.Context) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, NICGCControllerName)

nodeClaimList := &karpv1.NodeClaimList{}
if err := c.kubeClient.List(ctx, nodeClaimList); err != nil {
return reconcile.Result{}, fmt.Errorf("listing NodeClaims for NIC GC: %w", err)
}



// List all NICs from the instance provider, this List call will give us network interfaces that belong to karpenter
nics, err := c.instanceProvider.ListNics(ctx)
if err != nil {
return reconcile.Result{}, fmt.Errorf("listing NICs: %w", err)
}

// resourceNames is the resource representation for each nodeclaim
resourceNames := sets.New[string]()
resourceNames := sets.New[string]()
for _, nodeClaim := range nodeClaimList.Items {
// Adjust the prefix as per the aks naming convention
resourceNames.Insert(fmt.Sprintf("aks-%s", nodeClaim.Name))
}

errs := make([]error, len(nics))
workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int){
workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int) {
nicName := lo.FromPtr(nics[i].Name)
_, removableNic := c.unremovableNics.Get(nicName)
noNodeclaimExistsForNIC := !resourceNames.Has(nicName)
// The networkInterface is unremovable if its
// A: Reserved by NRP
// The networkInterface is unremovable if its
// A: Reserved by NRP
// B: Belongs to a Nodeclaim
// C: Belongs to VM
if noNodeclaimExistsForNIC && removableNic {
err := c.instanceProvider.DeleteNic(ctx, nicName)
if sdkerrors.IsNicReservedForAnotherVM(err) {
err := c.instanceProvider.DeleteNic(ctx, nicName)
if sdkerrors.IsNicReservedForAnotherVM(err) {
c.unremovableNics.Set(nicName, sdkerrors.NicReservedForAnotherVM, NicReservationDuration)
return
return
}
if err != nil {
errs[i] = err
return
if err != nil {
errs[i] = err
return
}

logging.FromContext(ctx).With("nic", nicName).Infof("garbage collected NIC")
Expand All @@ -100,14 +109,14 @@ func (c *NetworkInterfaceController) Reconcile(ctx context.Context) (reconcile.R

// requeue every 5 minutes, adjust for throttling?
return reconcile.Result{
Requeue: true,
Requeue: true,
RequeueAfter: time.Minute * 5,
}, nil
}

func (c *NetworkInterfaceController) Register(_ context.Context, m manager.Manager) error {
return controllerruntime.NewControllerManagedBy(m).
Named(NICGCControllerName).
WatchesRawSource(singleton.Source()).
return controllerruntime.NewControllerManagedBy(m).
Named(NICGCControllerName).
WatchesRawSource(singleton.Source()).
Complete(singleton.AsReconciler(c))
}
1 change: 0 additions & 1 deletion pkg/controllers/nodeclaim/garbagecollection/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,4 +333,3 @@ var _ = Describe("VirtualMachine Garbage Collection", func() {
})
})
})

11 changes: 4 additions & 7 deletions pkg/fake/azureresourcegraphapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ type AzureResourceGraphResourcesInput struct {
type AzureResourceGraphBehavior struct {
AzureResourceGraphResourcesBehavior MockedFunction[AzureResourceGraphResourcesInput, armresourcegraph.ClientResourcesResponse]
VirtualMachinesAPI *VirtualMachinesAPI
NetworkInterfacesAPI *NetworkInterfacesAPI
NetworkInterfacesAPI *NetworkInterfacesAPI
ResourceGroup string
}

Expand Down Expand Up @@ -82,16 +82,14 @@ func (c *AzureResourceGraphAPI) getResourceList(query string) []interface{} {
return nic.Tags != nil && nic.Tags[instance.NodePoolTagKey] != nil
})
resourceList := lo.Map(nicList, func(nic armnetwork.Interface, _ int) interface{} {
b, _ := json.Marshal(nic)
return convertBytesToInterface(b)
b, _ := json.Marshal(nic)
return convertBytesToInterface(b)
})
return resourceList
}
return nil
}



func (c *AzureResourceGraphAPI) loadVMObjects() (vmList []armcompute.VirtualMachine) {
c.VirtualMachinesAPI.Instances.Range(func(k, v any) bool {
vm, _ := c.VirtualMachinesAPI.Instances.Load(k)
Expand All @@ -101,11 +99,10 @@ func (c *AzureResourceGraphAPI) loadVMObjects() (vmList []armcompute.VirtualMach
return vmList
}


func (c *AzureResourceGraphAPI) loadNicObjects() (nicList []armnetwork.Interface) {
c.NetworkInterfacesAPI.NetworkInterfaces.Range(func(k, v any) bool {
nic, _ := c.NetworkInterfacesAPI.NetworkInterfaces.Load(k)
nicList = append(nicList, nic.(armnetwork.Interface))
nicList = append(nicList, nic.(armnetwork.Interface))
return true
})
return nicList
Expand Down
16 changes: 16 additions & 0 deletions pkg/providers/instance/arglist.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/*
Portions Copyright (c) Microsoft Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package instance

import (
Expand Down
9 changes: 4 additions & 5 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,10 @@ func (p *DefaultProvider) ListNics(ctx context.Context) ([]*armnetwork.Interface
return nicList, nil
}

func(p *DefaultProvider) DeleteNic(ctx context.Context, nicName string) error {
func (p *DefaultProvider) DeleteNic(ctx context.Context, nicName string) error {
return deleteNicIfExists(ctx, p.azClient.networkInterfacesClient, p.resourceGroup, nicName)
}


// createAKSIdentifyingExtension attaches a VM extension to identify that this VM participates in an AKS cluster
func (p *DefaultProvider) createAKSIdentifyingExtension(ctx context.Context, vmName string) (err error) {
vmExt := p.getAKSIdentifyingExtension()
Expand Down Expand Up @@ -674,10 +673,10 @@ func (p *DefaultProvider) cleanupAzureResources(ctx context.Context, resourceNam
// The order here is intentional, if the VM was created successfully, then we attempt to delete the vm, the
// nic, disk and all associated resources will be removed. If the VM was not created successfully and a nic was found,
// then we attempt to delete the nic.
nicErr := p.DeleteNic(ctx, resourceName)

nicErr := p.DeleteNic(ctx, resourceName)
if nicErr != nil {
logging.FromContext(ctx).Errorf("networkinterface.Delete for %s failed: %v", resourceName, nicErr)
logging.FromContext(ctx).Errorf("networkinterface.Delete for %s failed: %v", resourceName, nicErr)
}
return errors.Join(vmErr, nicErr)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/instance/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ var _ = Describe("InstanceProvider", func() {
return strings.Contains(key, "/") // ARM tags can't contain '/'
})).To(HaveLen(0))
})
It("should list nic from karpenter provisioning request", func(){
It("should list nic from karpenter provisioning request", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
Expand Down
6 changes: 3 additions & 3 deletions pkg/test/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func NewRegionalEnvironment(ctx context.Context, env *coretest.Environment, regi

// API
virtualMachinesAPI := &fake.VirtualMachinesAPI{}

networkInterfacesAPI := &fake.NetworkInterfacesAPI{}
virtualMachinesExtensionsAPI := &fake.VirtualMachineExtensionsAPI{}
pricingAPI := &fake.PricingAPI{}
Expand All @@ -103,9 +103,9 @@ func NewRegionalEnvironment(ctx context.Context, env *coretest.Environment, regi

azureResourceGraphAPI := &fake.AzureResourceGraphAPI{
AzureResourceGraphBehavior: fake.AzureResourceGraphBehavior{
VirtualMachinesAPI: virtualMachinesAPI,
VirtualMachinesAPI: virtualMachinesAPI,
NetworkInterfacesAPI: networkInterfacesAPI,
ResourceGroup: resourceGroup,
ResourceGroup: resourceGroup,
}}
// Cache
kubernetesVersionCache := cache.New(azurecache.KubernetesVersionTTL, azurecache.DefaultCleanupInterval)
Expand Down

0 comments on commit 454c0ad

Please sign in to comment.