Merge branch 'main' into tallaxes/e2e/drift
Bryce-Soghigian authored Feb 7, 2025
2 parents dd26f5e + ea76305 commit 1811f63
Showing 33 changed files with 809 additions and 144 deletions.
8 changes: 2 additions & 6 deletions Makefile-az.mk
@@ -25,7 +25,7 @@ az-all-cniv1: az-login az-create-workload-msi az-mkaks-cniv1 az-cre

az-all-cni-overlay: az-login az-create-workload-msi az-mkaks-overlay az-create-federated-cred az-perm az-perm-acr az-configure-values az-build az-run az-run-sample ## Provision the infra (ACR,AKS); build and deploy Karpenter; deploy sample Provisioner and workload

az-all-custom-vnet: az-login az-create-workload-msi az-mkaks-custom-vnet az-create-federated-cred az-perm-subnet-custom az-perm-acr az-configure-values-custom-vnet az-build az-run az-run-sample ## Provision the infra (ACR,AKS); build and deploy Karpenter; deploy sample Provisioner and workload
az-all-custom-vnet: az-login az-create-workload-msi az-mkaks-custom-vnet az-create-federated-cred az-perm-subnet-custom az-perm-acr az-configure-values az-build az-run az-run-sample ## Provision the infra (ACR,AKS); build and deploy Karpenter; deploy sample Provisioner and workload
az-all-user: az-login az-mkaks-user az-configure-values az-helm-install-snapshot az-run-sample ## Provision the cluster and deploy Karpenter snapshot release
# TODO: az-all-savm case is not currently built to support workload identity, need to re-evaluate
az-all-savm: az-login az-mkaks-savm az-perm-savm az-configure-values az-build az-run az-run-sample ## Provision the infra (ACR,AKS); build and deploy Karpenter; deploy sample Provisioner and workload - StandaloneVirtualMachines
@@ -116,11 +116,7 @@ az-rmrg: ## Destroy test ACR and AKS cluster by deleting the resource group (use
az group delete --name $(AZURE_RESOURCE_GROUP)

az-configure-values: ## Generate cluster-related values for Karpenter Helm chart
hack/deploy/configure-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(KARPENTER_SERVICE_ACCOUNT_NAME) $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME)

az-configure-values-custom-vnet: ## Generate cluster-related values for Karpenter Helm chart (take custom subnet ID from first agentpool)
VNET_SUBNET_ID=$(shell az aks show --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) | jq -r ".agentPoolProfiles[0].vnetSubnetId") \
$(MAKE) az-configure-values
hack/deploy/configure-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(KARPENTER_SERVICE_ACCOUNT_NAME) $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME)

az-mkvmssflex: ## Create VMSS Flex (optional, only if creating VMs referencing this VMSS)
az vmss create --name $(AZURE_CLUSTER_NAME)-vmss --resource-group $(AZURE_RESOURCE_GROUP_MC) --location $(AZURE_LOCATION) \
2 changes: 2 additions & 0 deletions charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml
@@ -583,6 +583,8 @@ spec:
rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh")
- message: label "kubernetes.io/hostname" is restricted
rule: self != "kubernetes.io/hostname"
- message: label domain "karpenter.azure.com" is restricted
rule: self in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !self.find("^([^/]+)").endsWith("karpenter.azure.com")
minValues:
description: |-
This field is ALPHA and can be dropped or replaced at any time
4 changes: 4 additions & 0 deletions charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml
@@ -676,6 +676,8 @@ spec:
rule: self.all(x, x != "karpenter.sh/nodepool")
- message: label "kubernetes.io/hostname" is restricted
rule: self.all(x, x != "kubernetes.io/hostname")
- message: label domain "karpenter.azure.com" is restricted
rule: self.all(x, x in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !x.find("^([^/]+)").endsWith("karpenter.azure.com"))
type: object
spec:
description: NodeClaimSpec describes the desired state of the NodeClaim
@@ -832,6 +834,8 @@ spec:
rule: self != "karpenter.sh/nodepool"
- message: label "kubernetes.io/hostname" is restricted
rule: self != "kubernetes.io/hostname"
- message: label domain "karpenter.azure.com" is restricted
rule: self in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !self.find("^([^/]+)").endsWith("karpenter.azure.com")
minValues:
description: |-
This field is ALPHA and can be dropped or replaced at any time
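The new CEL rules reserve the karpenter.azure.com label domain for the listed sku-* keys while leaving other domains alone. A quick way to exercise the NodePool rule is sketched below; the NodePool name, node class reference, and requirement values are hypothetical, and the exact admission error wording comes from the API server.

# Apply a NodePool whose template sets a label under the reserved domain but
# outside the allowed sku-* keys; the CRD's CEL validation should reject it.
# All names/values here are illustrative only.
cat <<'EOF' | kubectl apply --dry-run=server -f -
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: cel-label-check
spec:
  template:
    metadata:
      labels:
        karpenter.azure.com/my-label: "value"   # restricted: not an allowed sku-* key
    spec:
      nodeClassRef:
        group: karpenter.azure.com
        kind: AKSNodeClass
        name: default
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
EOF
# Expected: the request is denied with a message containing
#   label domain "karpenter.azure.com" is restricted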
2 changes: 1 addition & 1 deletion go.mod
@@ -1,6 +1,6 @@
module github.com/Azure/karpenter-provider-azure

go 1.23.0
go 1.23.6

require (
github.com/Azure/azure-kusto-go v0.16.1
32 changes: 25 additions & 7 deletions hack/deploy/configure-values.sh
@@ -1,6 +1,5 @@
#!/usr/bin/env bash
set -euo pipefail

# This script interrogates the AKS cluster and Azure resources to generate
# the karpenter-values.yaml file using the karpenter-values-template.yaml file as a template.

@@ -33,11 +32,30 @@ BOOTSTRAP_TOKEN=$TOKEN_ID.$TOKEN_SECRET

SSH_PUBLIC_KEY="$(cat ~/.ssh/id_rsa.pub) azureuser"

if [[ ! -v VNET_SUBNET_ID ]]; then
# first subnet of first VNet found
VNET_JSON=$(az network vnet list --resource-group "$AZURE_RESOURCE_GROUP_MC" | jq -r ".[0]")
VNET_SUBNET_ID=$(jq -r ".subnets[0].id" <<< "$VNET_JSON")
fi

get_vnet_json() {
local resource_group=$1
local aks_json=$2

local vnet_json
vnet_json=$(az network vnet list --resource-group "$resource_group" | jq -r ".[0]")

if [[ -z "$vnet_json" || "$vnet_json" == "null" ]]; then
local subnet_id
subnet_id=$(jq -r ".agentPoolProfiles[0].vnetSubnetId" <<< "$aks_json")
local vnet_id
vnet_id=$(echo "$subnet_id" | sed 's|/subnets/[^/]*$||')
vnet_json=$(az network vnet show --ids "$vnet_id")
fi

echo "$vnet_json"
}

# Retrieve VNET JSON
VNET_JSON=$(get_vnet_json "$AZURE_RESOURCE_GROUP_MC" "$AKS_JSON")
# Extract the properties we need from the VNet JSON
VNET_SUBNET_ID=$(jq -r ".subnets[0].id" <<< "$VNET_JSON")
VNET_GUID=$(jq -r ".resourceGuid // empty" <<< "$VNET_JSON")

# The // empty ensures that if the field is 'null' or not present, jq will output nothing
# If the value returned is 'none', it's from jq and not the AKS API; in this case we return ""
@@ -51,7 +69,7 @@ KARPENTER_USER_ASSIGNED_CLIENT_ID=$(az identity show --resource-group "${AZURE_R

export CLUSTER_NAME AZURE_LOCATION AZURE_RESOURCE_GROUP_MC KARPENTER_SERVICE_ACCOUNT_NAME \
CLUSTER_ENDPOINT BOOTSTRAP_TOKEN SSH_PUBLIC_KEY VNET_SUBNET_ID KARPENTER_USER_ASSIGNED_CLIENT_ID NODE_IDENTITIES AZURE_SUBSCRIPTION_ID NETWORK_PLUGIN NETWORK_PLUGIN_MODE NETWORK_POLICY \
LOG_LEVEL
LOG_LEVEL VNET_GUID

# get karpenter-values-template.yaml, if not already present (e.g. outside of repo context)
if [ ! -f karpenter-values-template.yaml ]; then
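This fallback is what lets az-all-custom-vnet reuse the plain az-configure-values target above: when az network vnet list finds nothing in the MC resource group, the parent VNet ID is recovered from the first agent pool's subnet ID. A standalone sketch of that derivation, using a fabricated resource ID:

#!/usr/bin/env bash
# Sketch of the subnet-ID -> VNet-ID derivation in get_vnet_json's fallback path.
# The resource ID below is made up for illustration.
subnet_id="/subscriptions/0000/resourceGroups/custom-rg/providers/Microsoft.Network/virtualNetworks/custom-vnet/subnets/nodesubnet"

# Drop the trailing "/subnets/<name>" segment to get the parent VNet ID.
vnet_id="$(sed 's|/subnets/[^/]*$||' <<< "$subnet_id")"
echo "$vnet_id"
# /subscriptions/0000/resourceGroups/custom-rg/providers/Microsoft.Network/virtualNetworks/custom-vnet

# With a real cluster, the full VNet JSON would then come from:
#   az network vnet show --ids "$vnet_id"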
10 changes: 10 additions & 0 deletions hack/validation/labels.sh
@@ -29,6 +29,16 @@ rule=${rule//\"/\\\"} # escape double quotes
rule=${rule//$'\n'/} # remove newlines
rule=$(echo "$rule" | tr -s ' ') # remove extra spaces

# check that .spec.versions has 2 entries
[[ $(yq e '.spec.versions | length' pkg/apis/crds/karpenter.sh_nodepools.yaml) -eq 2 ]] || { echo "expected two versions"; exit 1; }

# nodepool
# v1beta1
printf -v expr '.spec.versions[1].schema.openAPIV3Schema.properties.spec.properties.template.properties.metadata.properties.labels.x-kubernetes-validations +=
[{"message": "label domain \\"karpenter.azure.com\\" is restricted", "rule": "%s"}]' "$rule"
yq eval "${expr}" -i pkg/apis/crds/karpenter.sh_nodepools.yaml

# v1
# nodepool
printf -v expr '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.template.properties.metadata.properties.labels.x-kubernetes-validations +=
[{"message": "label domain \\"karpenter.azure.com\\" is restricted", "rule": "%s"}]' "$rule"
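A quick post-run check that both served versions of the NodePool CRD picked up the new rule — a sketch, assuming the same yq binary and file layout used above:

# Count how many label-validation messages mention the restricted Azure domain;
# with versions[0] (v1) and versions[1] (v1beta1) both patched, this should print 2.
yq e '.spec.versions[].schema.openAPIV3Schema.properties.spec.properties.template.properties.metadata.properties.labels.x-kubernetes-validations[].message' \
  pkg/apis/crds/karpenter.sh_nodepools.yaml | grep -c 'karpenter.azure.com'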
16 changes: 16 additions & 0 deletions hack/validation/requirements.sh
@@ -28,6 +28,22 @@ rule=${rule//\"/\\\"} # escape double quotes
rule=${rule//$'\n'/} # remove newlines
rule=$(echo "$rule" | tr -s ' ') # remove extra spaces

# check that .spec.versions has 2 entries
[[ $(yq e '.spec.versions | length' pkg/apis/crds/karpenter.sh_nodepools.yaml) -eq 2 ]] || { echo "expected two versions"; exit 1; }
[[ $(yq e '.spec.versions | length' pkg/apis/crds/karpenter.sh_nodeclaims.yaml) -eq 2 ]] || { echo "expected two versions"; exit 1; }

# v1beta1
# nodeclaim
printf -v expr '.spec.versions[1].schema.openAPIV3Schema.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations +=
[{"message": "label domain \\"karpenter.azure.com\\" is restricted", "rule": "%s"}]' "$rule"
yq eval "${expr}" -i pkg/apis/crds/karpenter.sh_nodeclaims.yaml

# nodepool
printf -v expr '.spec.versions[1].schema.openAPIV3Schema.properties.spec.properties.template.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations +=
[{"message": "label domain \\"karpenter.azure.com\\" is restricted", "rule": "%s"}]' "$rule"
yq eval "${expr}" -i pkg/apis/crds/karpenter.sh_nodepools.yaml

# v1
# nodeclaim
printf -v expr '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations +=
[{"message": "label domain \\"karpenter.azure.com\\" is restricted", "rule": "%s"}]' "$rule"
2 changes: 2 additions & 0 deletions karpenter-values-template.yaml
@@ -25,6 +25,8 @@ controller:
value: ${NETWORK_POLICY}
- name: VNET_SUBNET_ID
value: ${VNET_SUBNET_ID}
- name: VNET_GUID
value: ${VNET_GUID}
- name: NODE_IDENTITIES
value: ${NODE_IDENTITIES}

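Once configure-values.sh has rendered karpenter-values.yaml from this template, the new variable can be spot-checked. A minimal sketch, assuming the env list sits at .controller.env as the hunk above suggests:

# Print the value substituted for VNET_GUID in the generated values file;
# an empty string means resourceGuid was absent from the VNet JSON.
yq e '.controller.env[] | select(.name == "VNET_GUID") | .value' karpenter-values.yaml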
2 changes: 2 additions & 0 deletions pkg/apis/crds/karpenter.sh_nodeclaims.yaml
@@ -583,6 +583,8 @@ spec:
rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh")
- message: label "kubernetes.io/hostname" is restricted
rule: self != "kubernetes.io/hostname"
- message: label domain "karpenter.azure.com" is restricted
rule: self in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !self.find("^([^/]+)").endsWith("karpenter.azure.com")
minValues:
description: |-
This field is ALPHA and can be dropped or replaced at any time
4 changes: 4 additions & 0 deletions pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -676,6 +676,8 @@ spec:
rule: self.all(x, x != "karpenter.sh/nodepool")
- message: label "kubernetes.io/hostname" is restricted
rule: self.all(x, x != "kubernetes.io/hostname")
- message: label domain "karpenter.azure.com" is restricted
rule: self.all(x, x in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !x.find("^([^/]+)").endsWith("karpenter.azure.com"))
type: object
spec:
description: NodeClaimSpec describes the desired state of the NodeClaim
@@ -832,6 +834,8 @@ spec:
rule: self != "karpenter.sh/nodepool"
- message: label "kubernetes.io/hostname" is restricted
rule: self != "kubernetes.io/hostname"
- message: label domain "karpenter.azure.com" is restricted
rule: self in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !self.find("^([^/]+)").endsWith("karpenter.azure.com")
minValues:
description: |-
This field is ALPHA and can be dropped or replaced at any time
2 changes: 1 addition & 1 deletion pkg/cloudprovider/cloudprovider.go
@@ -138,6 +138,7 @@ func (c *CloudProvider) List(ctx context.Context) ([]*karpv1.NodeClaim, error) {
if err != nil {
return nil, fmt.Errorf("listing instances, %w", err)
}

var nodeClaims []*karpv1.NodeClaim
for _, instance := range instances {
instanceType, err := c.resolveInstanceTypeFromInstance(ctx, instance)
@@ -337,7 +338,6 @@ func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute.

labels[karpv1.CapacityTypeLabelKey] = instance.GetCapacityType(vm)

// TODO: v1beta1 new kes/labels
if tag, ok := vm.Tags[instance.NodePoolTagKey]; ok {
labels[karpv1.NodePoolLabelKey] = *tag
}
2 changes: 1 addition & 1 deletion pkg/cloudprovider/suite_test.go
@@ -144,7 +144,7 @@ var _ = Describe("CloudProvider", func() {
nodeClaims, _ := cloudProvider.List(ctx)
Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1))
queryRequest := azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Pop().Query
Expect(*queryRequest.Query).To(Equal(instance.GetListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String()))
Expect(*queryRequest.Query).To(Equal(instance.GetVMListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String()))
Expect(nodeClaims).To(HaveLen(1))
Expect(nodeClaims[0]).ToNot(BeNil())
resp, _ := azureEnv.VirtualMachinesAPI.Get(ctx, azureEnv.AzureResourceGraphAPI.ResourceGroup, nodeClaims[0].Name, nil)
5 changes: 4 additions & 1 deletion pkg/controllers/controllers.go
@@ -44,7 +44,10 @@ func NewControllers(ctx context.Context, mgr manager.Manager, kubeClient client.
nodeclasshash.NewController(kubeClient),
nodeclassstatus.NewController(kubeClient),
nodeclasstermination.NewController(kubeClient, recorder),
nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider),

nodeclaimgarbagecollection.NewVirtualMachine(kubeClient, cloudProvider),
nodeclaimgarbagecollection.NewNetworkInterface(kubeClient, instanceProvider),

// TODO: nodeclaim tagging
inplaceupdate.NewController(kubeClient, instanceProvider),
status.NewController[*v1alpha2.AKSNodeClass](kubeClient, mgr.GetEventRecorderFor("karpenter")),
@@ -23,7 +23,6 @@ import (

"github.com/awslabs/operatorpkg/singleton"

// "github.com/Azure/karpenter-provider-azure/pkg/cloudprovider"
"github.com/samber/lo"
"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
@@ -41,21 +40,21 @@ import (
corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider"
)

type Controller struct {
type VirtualMachine struct {
kubeClient client.Client
cloudProvider corecloudprovider.CloudProvider
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeueing near the start of the controller
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeuing near the start of the controller
}

func NewController(kubeClient client.Client, cloudProvider corecloudprovider.CloudProvider) *Controller {
return &Controller{
func NewVirtualMachine(kubeClient client.Client, cloudProvider corecloudprovider.CloudProvider) *VirtualMachine {
return &VirtualMachine{
kubeClient: kubeClient,
cloudProvider: cloudProvider,
successfulCount: 0,
}
}

func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
func (c *VirtualMachine) Reconcile(ctx context.Context) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, "instance.garbagecollection")

// We LIST VMs on the CloudProvider BEFORE we grab NodeClaims/Nodes on the cluster so that we make sure that, if
@@ -65,6 +64,7 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
if err != nil {
return reconcile.Result{}, fmt.Errorf("listing cloudprovider VMs, %w", err)
}

managedRetrieved := lo.Filter(retrieved, func(nc *karpv1.NodeClaim, _ int) bool {
return nc.DeletionTimestamp.IsZero()
})
@@ -93,7 +93,7 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
return reconcile.Result{RequeueAfter: lo.Ternary(c.successfulCount <= 20, time.Second*10, time.Minute*2)}, nil
}

func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *karpv1.NodeClaim, nodeList *v1.NodeList) error {
func (c *VirtualMachine) garbageCollect(ctx context.Context, nodeClaim *karpv1.NodeClaim, nodeList *v1.NodeList) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("provider-id", nodeClaim.Status.ProviderID))
if err := c.cloudProvider.Delete(ctx, nodeClaim); err != nil {
return corecloudprovider.IgnoreNodeClaimNotFoundError(err)
@@ -112,7 +112,7 @@ func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *karpv1.NodeC
return nil
}

func (c *Controller) Register(_ context.Context, m manager.Manager) error {
func (c *VirtualMachine) Register(_ context.Context, m manager.Manager) error {
return controllerruntime.NewControllerManagedBy(m).
Named("instance.garbagecollection").
WatchesRawSource(singleton.Source()).