Skip to content

Commit

Permalink
Merge branch 'release-v0.6.x' into tallaxes/release-v0.6.x/zones
Browse files Browse the repository at this point in the history
  • Loading branch information
tallaxes authored Jan 22, 2025
2 parents 9817d90 + e1f6e69 commit ff53ecc
Show file tree
Hide file tree
Showing 18 changed files with 477 additions and 199 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.17.2
rev: v1.28.1
hooks:
- id: typos
- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ require (
github.com/stretchr/testify v1.9.0
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.29.3
k8s.io/apiextensions-apiserver v0.29.3
k8s.io/apimachinery v0.29.3
Expand Down Expand Up @@ -146,7 +147,6 @@ require (
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cloud-provider v0.29.3 // indirect
k8s.io/component-base v0.29.3 // indirect
Expand Down
2 changes: 1 addition & 1 deletion hack/codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ skugen() {
NO_UPDATE=" pkg/fake/zz_generated.sku.$location.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)"
SUBJECT="SKUGEN"

go run hack/code/instancetype_testdata_gen/main.go -- "${GENERATED_FILE}" "$location" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_D16plds_v5,Standard_F16s_v2,Standard_NC6s,Standard_NC6s_v3,Standard_NC16as_T4_v3,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3,Standard_DC8s_v3"
go run hack/code/instancetype_testdata_gen/main.go -- "${GENERATED_FILE}" "$location" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_D16plds_v5,Standard_F16s_v2,Standard_NC6s,Standard_NC6s_v3,Standard_NC16as_T4_v3,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3,Standard_DC8s_v3,Standard_D2as_v6"
go fmt "${GENERATED_FILE}"

GIT_DIFF=$(git diff --stat "${GENERATED_FILE}")
Expand Down
150 changes: 112 additions & 38 deletions pkg/fake/zz_generated.sku.eastus.go

Large diffs are not rendered by default.

134 changes: 116 additions & 18 deletions pkg/fake/zz_generated.sku.westcentralus.go

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pkg/providers/imagefamily/azlinux.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ func (u AzureLinux) UserData(kubeletConfig *corev1beta1.KubeletConfiguration, ta
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
// GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu
// See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201
SubnetID: u.Options.SubnetID,
GPUDriverType: u.Options.GPUDriverType,
GPUImageSHA: u.Options.GPUImageSHA,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/imagefamily/bootstrap/aksbootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ type NodeBootstrapVariables struct {
SwapFileSizeMB int // t user input
GPUImageSHA string // s static sha rarely updated
GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator.
GPUDriverType string // k
GPUInstanceProfile string // t user-specified
CustomSearchDomainName string // c user-specified [presumably cluster-level]
CustomSearchRealmUser string // c user-specified [presumably cluster-level]
Expand Down Expand Up @@ -468,6 +469,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) {
nbv.GPUNode = true
nbv.ConfigGPUDriverIfNeeded = true
nbv.GPUDriverVersion = a.GPUDriverVersion
nbv.GPUDriverType = a.GPUDriverType
nbv.GPUImageSHA = a.GPUImageSHA
}

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type Options struct {
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
SubnetID string
}
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}"
SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}"
GPU_IMAGE_SHA="{{.GPUImageSHA}}"
GPU_DRIVER_VERSION="{{.GPUDriverVersion}}"
GPU_DRIVER_TYPE="{{.GPUDriverType}}"
GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}"
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/ubuntu_2204.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ func (u Ubuntu2204) UserData(kubeletConfig *corev1beta1.KubeletConfiguration, ta
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
GPUDriverType: u.Options.GPUDriverType,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
Expand Down
21 changes: 18 additions & 3 deletions pkg/providers/instancetype/instancetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/mitchellh/hashstructure/v2"
"github.com/samber/lo"

"github.com/Azure/azure-sdk-for-go/profiles/latest/compute/mgmt/compute"
"github.com/Azure/go-autorest/autorest/to"
"github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2"
kcache "github.com/Azure/karpenter-provider-azure/pkg/cache"
Expand Down Expand Up @@ -212,7 +213,8 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU
continue
}

if !skus[i].HasLocationRestriction(p.region) && p.isSupported(&skus[i], vmsize) {
useSIG := false // replace with options.FromContext(ctx).UseSIG when available
if !skus[i].HasLocationRestriction(p.region) && p.isSupported(&skus[i], vmsize, useSIG) {
instanceTypes[skus[i].GetName()] = &skus[i]
}
}
Expand All @@ -229,13 +231,14 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU
}

// isSupported indicates SKU is supported by AKS, based on SKU properties
func (p *Provider) isSupported(sku *skewer.SKU, vmsize *skewer.VMSizeType) bool {
func (p *Provider) isSupported(sku *skewer.SKU, vmsize *skewer.VMSizeType, useSIG bool) bool {
return p.hasMinimumCPU(sku) &&
p.hasMinimumMemory(sku) &&
!p.isUnsupportedByAKS(sku) &&
!p.isUnsupportedGPU(sku) &&
!p.hasConstrainedCPUs(vmsize) &&
!p.isConfidential(sku)
!p.isConfidential(sku) &&
isCompatibleImageAvailable(sku, useSIG)
}

// at least 2 cpus
Expand Down Expand Up @@ -344,3 +347,15 @@ var (
func hasZonalSupport(region string) bool {
return zonalRegions.Has(region)
}

func isCompatibleImageAvailable(sku *skewer.SKU, useSIG bool) bool {
hasSCSISupport := func(sku *skewer.SKU) bool { // TODO: move capability determination to skewer
const diskControllerTypeCapability = "DiskControllerTypes"
declaresSCSI := sku.HasCapabilityWithSeparator(diskControllerTypeCapability, string(compute.SCSI))
declaresNVMe := sku.HasCapabilityWithSeparator(diskControllerTypeCapability, string(compute.NVMe))
declaresNothing := !(declaresSCSI || declaresNVMe)
return declaresSCSI || declaresNothing // if nothing is declared, assume SCSI is supported
}

return useSIG || hasSCSISupport(sku) // CIG images are not currently tagged for NVMe
}
48 changes: 27 additions & 21 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,9 @@ var _ = Describe("InstanceType Provider", func() {
It("should not include confidential SKUs", func() {
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_DC8s_v3"))))
})
It("should not include SKUs without compatible image", func() {
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_D2as_v6"))))
})
})
Context("Filtering GPU SKUs ProviderList(AzureLinux)", func() {
var instanceTypes corecloudprovider.InstanceTypes
Expand Down Expand Up @@ -1074,20 +1077,15 @@ var _ = Describe("InstanceType Provider", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
node := ExpectScheduled(ctx, env.Client, pod)

Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
Expect(vm.Properties).ToNot(BeNil())
Expect(vm.Properties.HardwareProfile).ToNot(BeNil())
Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse())

clusterNodes := cluster.Nodes()
node := clusterNodes[0]
if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0"))
}
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0"))
})

It("should schedule GPU pod on GPU capable node", func() {
Expand Down Expand Up @@ -1117,23 +1115,31 @@ var _ = Describe("InstanceType Provider", func() {
})

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)

// Verify that the node has the GPU label set that the pod was scheduled on
clusterNodes := cluster.Nodes()
Expect(clusterNodes).ToNot(BeEmpty())
Expect(len(clusterNodes)).To(Equal(1))
node := clusterNodes[0]
Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
node := ExpectScheduled(ctx, env.Client, pod)

if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
// the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption
Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3"))

Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100"))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1"))
// Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3)
customData := ExpectDecodedCustomData(azureEnv)
Expect(customData).To(SatisfyAll(
ContainSubstring("GPU_NODE=true"),
ContainSubstring("SGX_NODE=false"),
ContainSubstring("MIG_NODE=false"),
ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"),
ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"),
ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""),
ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)),
ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)),
ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""),
ContainSubstring("GPU_INSTANCE_PROFILE=\"\""),
))

}
// Verify that the node the pod was scheduled on has GPU resource and labels set
Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4"))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1"))
})
})

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp
Arch: arch,
GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name),
GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name),
GPUDriverType: utils.GetGPUDriverType(instanceType.Name),
GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name),
TenantID: p.tenantID,
SubscriptionID: p.subscriptionID,
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/parameters/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type StaticParameters struct {
Arch string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
TenantID string
SubscriptionID string
Expand Down
Loading

0 comments on commit ff53ecc

Please sign in to comment.