From 8cd5f51517a9314bdbd078ddaa4ff753631028ea Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 17 Feb 2025 13:21:28 +1300 Subject: [PATCH 01/19] set AMD_GPU_NODE variable --- aks-node-controller/parser/parser.go | 1 + parts/linux/cloud-init/artifacts/cse_cmd.sh | 1 + pkg/agent/variables.go | 1 + 3 files changed, 3 insertions(+) diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index e60c1d1335d..9cfe4a248be 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -83,6 +83,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(), "IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)), "GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)), + "AMD_GPU_NODE": fmt.Sprintf("%v", config.GetGpuConfig().GetEnableAmdGpu()), "SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())), "MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())), "CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()), diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 06073b42aa2..cf081ce5603 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -60,6 +60,7 @@ USER_ASSIGNED_IDENTITY_ID={{GetVariable "userAssignedIdentityID"}} API_SERVER_NAME={{GetKubernetesEndpoint}} IS_VHD={{GetVariable "isVHD"}} GPU_NODE={{GetVariable "gpuNode"}} +AMD_GPU_NODE={{GetVariable "amdGpuNode"}} SGX_NODE={{GetVariable "sgxNode"}} MIG_NODE={{GetVariable "migNode"}} CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}} diff --git a/pkg/agent/variables.go b/pkg/agent/variables.go index 80eedb389f5..f08a09922b6 100644 --- a/pkg/agent/variables.go +++ b/pkg/agent/variables.go @@ -113,6 +113,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa "userAssignedIdentityID": config.UserAssignedIdentityClientID, "isVHD": isVHD(profile), "gpuNode": strconv.FormatBool(config.EnableNvidia), + "amdGpuNode": strconv.FormatBool(config.EnableAMDGPU), "sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)), "configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded, "enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded, From af0c6a97ef5d18ad2527ab8718490f1777b8e55b Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 17 Feb 2025 13:32:03 +1300 Subject: [PATCH 02/19] add placeholder to install AMD GPU drivers --- parts/linux/cloud-init/artifacts/cse_helpers.sh | 2 +- parts/linux/cloud-init/artifacts/cse_main.sh | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index a13de062ad1..12308b39abb 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -523,7 +523,7 @@ logs_to_events() { fi } -should_skip_nvidia_drivers() { +should_skip_gpu_drivers() { set -x body=$(curl -fsSL -H "Metadata: true" --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") ret=$?
diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index 2917e385a5f..7c76205cb69 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -97,15 +97,15 @@ if [[ -n ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then logs_to_events "AKS.CSE.orasLogin.oras_login_with_kubelet_identity" oras_login_with_kubelet_identity "${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER%/}" $USER_ASSIGNED_IDENTITY_ID $TENANT_ID || exit $? fi -export -f should_skip_nvidia_drivers -skip_nvidia_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_nvidia_drivers) +export -f should_skip_gpu_drivers +skip_gpu_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_gpu_drivers) ret=$? if [[ "$ret" != "0" ]]; then echo "Failed to determine if nvidia driver install should be skipped" exit $ERR_NVIDIA_DRIVER_INSTALL fi -if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_nvidia_driver_install}" == "true" ]]; then +if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_gpu_driver_install}" == "true" ]]; then logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers fi @@ -157,7 +157,7 @@ fi REBOOTREQUIRED=false echo $(date),$(hostname), "Start configuring GPU drivers" -if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" != "true" ]]; then +if [[ "${GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers if [[ "${ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED}" = true ]]; then if [[ "${MIG_NODE}" == "true" ]] && [[ -f "/etc/systemd/system/nvidia-device-plugin.service" ]]; then @@ -206,6 +206,11 @@ EOF fi fi +if [[ "${AMD_GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then + logs_to_events "AKS.CSE.ensureAMDGPUDrivers" ensureAMDGPUDrivers +fi + + echo $(date),$(hostname), "End configuring GPU drivers" if [ "${NEEDS_DOCKER_LOGIN}" == "true" ]; then From 034e069312c0d48c08b332427ee7f390df7d74ca Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 17 Feb 2025 13:37:36 +1300 Subject: [PATCH 03/19] add script to install AMD drivers --- parts/linux/cloud-init/artifacts/cse_config.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 710552ffbac..2aafc964481 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -844,6 +844,24 @@ ensureGPUDrivers() { fi } +# TODO: this is a temporary ubuntu-only HACK until we get a driver +ensureAMDGPUDrivers() { + echo "Installing AMD GPU drivers" + + # delete amdgpu module from blacklist + sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf + + # temporary solution, until the driver is available in MCR + sudo apt-get update + wget https://repo.radeon.com/amdgpu-install/6.3.1/ubuntu/jammy/amdgpu-install_6.3.60301-1_all.deb + sudo apt-get install -y ./amdgpu-install_6.3.60301-1_all.deb + sudo apt-get update + sudo apt-get install -y amdgpu-dkms + + REBOOTREQUIRED=true + echo "AMD GPU drivers installed" +} + disableSSH() { systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH } From 7c9e2c2c5edb90306b47cc31a8594125b1672c07 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 17 Feb 2025 14:11:09 +1300 Subject: [PATCH 04/19] add amd gpu test --- e2e/config/azure.go | 40 +++++++++++++++-------------- e2e/kube.go | 49 +++++++++++++++++++++++++++++++++++ 
e2e/scenario_test.go | 61 ++++++++++++++++++++++++++++++++++++++++++++ e2e/validators.go | 12 +++++++++ 4 files changed, 143 insertions(+), 19 deletions(-) diff --git a/e2e/config/azure.go b/e2e/config/azure.go index 276dc7176bf..0884e39903a 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -291,25 +291,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin } func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) { - identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{ - Location: to.Ptr(Config.Location), - }, nil) - if err != nil { - return "", fmt.Errorf("create managed identity: %w", err) - } - err = a.createBlobStorageAccount(ctx) - if err != nil { - return "", err - } - err = a.createBlobStorageContainer(ctx) - if err != nil { - return "", err - } - - if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil { - return "", err - } - return *identity.Properties.ClientID, nil + // HACK: temporary disable to allow running test in different subscription, without enough permissions + return "", nil + // identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{ + // Location: to.Ptr(Config.Location), + // }, nil) + // if err != nil { + // return "", fmt.Errorf("create managed identity: %w", err) + // } + // err = a.createBlobStorageAccount(ctx) + // if err != nil { + // return "", err + // } + // err = a.createBlobStorageContainer(ctx) + // if err != nil { + // return "", err + // } + + // if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil { + // return "", err + // } + // return *identity.Properties.ClientID, nil } func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { diff --git a/e2e/kube.go b/e2e/kube.go index 5f7455b236c..d19bc99ff28 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -637,3 +637,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet { }, } } + +func podEnableAMDGPUResource(s *Scenario) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName), + Namespace: defaultNamespace, + }, + Spec: corev1.PodSpec{ + PriorityClassName: "system-node-critical", + NodeSelector: map[string]string{ + "kubernetes.io/hostname": s.Runtime.KubeNodeName, + }, + Containers: []corev1.Container{ + { + Name: "amdgpu-device-plugin-container", + Image: "rocm/k8s-device-plugin", + VolumeMounts: []corev1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + { + Name: "sys", + MountPath: "/sys", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "device-plugin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + { + Name: "sys", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/sys", + }, + }, + }, + }, + }, + } +} diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 748e6ba116f..47c91357fbb 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -1664,3 +1664,64 @@ func Test_Ubuntu2404ARM(t *testing.T) { }, }) } + +func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { + //t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet") + RunScenario(t, &Scenario{ + Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be 
properly bootstrapped", Tags: Tags{ GPU: true, }, Config: Config{ Cluster: ClusterKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, //TODO: add support for older BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5" nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2" nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5" nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2" nbc.EnableAMDGPU = true nbc.ConfigGPUDriverIfNeeded = true }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5") vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom }, Validator: func(ctx context.Context, s *Scenario) { ValidateAMDGPU(ctx, s) }, }, }) } + +func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) { + // the SKU isn't available in subscription/region we run tests + //t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") + // LOCATION=southcentralus + RunScenario(t, &Scenario{ + Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5" + nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2" + nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5" + nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5") + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} diff --git a/e2e/validators.go b/e2e/validators.go index c56c6fb4a61..e7c0952a778 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -427,3 +427,15 @@ func GetFieldFromJsonObjectOnNode(ctx context.Context, s *Scenario, fileName str return podExecResult.stdout.String() } + +func ValidateAMDGPU(ctx context.Context, s *Scenario) { + s.T.Logf("validating pod using AMD GPU") + + execResult := execScriptOnVMForScenario(ctx, s, "lspci -k") + require.Equal(s.T, "0", execResult.exitCode, "expected to find lspci command, but did not") + assert.Contains(s.T, execResult.stdout.String(), "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not") + + ensurePod(ctx, s, podEnableAMDGPUResource(s)) + waitUntilResourceAvailable(ctx, s, "amd.com/gpu") + //ensureJob(ctx, s, jobAMDGPUWorkload(s)) +} \ No newline at end of file From 12cb3f937e91d8519d314d56df454e369e2aef14 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Tue, 18 Feb 2025 12:23:54 +1300 Subject: [PATCH 05/19] cache dependencies --- e2e/vmss.go | 1 - .../linux/cloud-init/artifacts/cse_config.sh | 12 ++----- parts/linux/cloud-init/artifacts/cse_main.sh | 5 +++ vhdbuilder/packer/install-dependencies.sh | 33 +++++++++++++++++++ vhdbuilder/packer/pre-install-dependencies.sh | 2 +- 5
files changed, 42 insertions(+), 11 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 31d6fe17c7a..6443ef3a928 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -36,7 +36,6 @@ const ( func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScaleSet { cluster := s.Runtime.Cluster - s.T.Logf("creating VMSS %q in resource group %q", s.Runtime.VMSSName, *cluster.Model.Properties.NodeResourceGroup) var nodeBootstrapping *datamodel.NodeBootstrapping ab, err := agent.NewAgentBaker() require.NoError(s.T, err) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 2aafc964481..f47342e2e16 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -848,15 +848,9 @@ ensureGPUDrivers() { ensureAMDGPUDrivers() { echo "Installing AMD GPU drivers" - # delete amdgpu module from blacklist - sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf - - # temporary solution, until the driver is available in MCR - sudo apt-get update - wget https://repo.radeon.com/amdgpu-install/6.3.1/ubuntu/jammy/amdgpu-install_6.3.60301-1_all.deb - sudo apt-get install -y ./amdgpu-install_6.3.60301-1_all.deb - sudo apt-get update - sudo apt-get install -y amdgpu-dkms + pushd /var/cache/amdgpu-apt + sudo dpkg -i *.deb + popd REBOOTREQUIRED=true echo "AMD GPU drivers installed" diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index 7c76205cb69..e896b12a3ca 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -208,6 +208,11 @@ fi if [[ "${AMD_GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then logs_to_events "AKS.CSE.ensureAMDGPUDrivers" ensureAMDGPUDrivers +else + # delete cached amd gpu packages to save disk space + sudo rm /etc/apt/keyrings/rocm.gpg + sudo rm /etc/apt/sources.list.d/amdgpu.list + sudo rm -rf /var/cache/amdgpu-apt/* fi diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 787faad1a18..79e8bf355ee 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -619,3 +619,36 @@ rm -f ./azcopy # cleanup immediately after usage will return in two downloads echo "install-dependencies step completed successfully" capture_benchmark "${SCRIPT_NAME}_overall" true process_benchmarks + + +download_amdgpu_drivers() { + if [[ $OS != $UBUNTU_OS_NAME ]]; then + echo "Skipping AMD GPU driver setup: Unsupported OS (${OS})" + return + fi + echo "Downloading AMD GPU drivers for Ubuntu ${UBUNTU_RELEASE}" + # Determine the appropriate Ubuntu release + if [ "${UBUNTU_RELEASE}" == "22.04" ]; then + DISTRO="jammy" + elif [ "${UBUNTU_RELEASE}" == "24.04" ]; then + DISTRO="noble" + else + echo "Skipping AMD GPU driver setup: Unsupported Ubuntu release (${UBUNTU_RELEASE})" + return + fi + + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \ + gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + sudo chmod 0644 /etc/apt/keyrings/rocm.gpg + echo "deb [arch=amd64,i386 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.3.2/ubuntu ${DISTRO} main" \ + | sudo tee /etc/apt/sources.list.d/amdgpu.list + sudo apt-get update + + # Download to /var/cache/apt/archives/ + sudo mkdir -p /var/cache/amdgpu-apt/ + sudo chmod 777 /var/cache/amdgpu-apt/ + sudo apt-get install 
-o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only -y amdgpu-dkms +} + +download_amdgpu_drivers \ No newline at end of file diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index 71a681bc92d..ce70973325a 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -132,4 +132,4 @@ fi capture_benchmark "${SCRIPT_NAME}_handle_azureLinux_and_cgroupV2" echo "pre-install-dependencies step finished successfully" capture_benchmark "${SCRIPT_NAME}_overall" true -process_benchmarks \ No newline at end of file +process_benchmarks From b71f5a939e20f3b7eb61c4cc0c34f3f088a92d78 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Tue, 18 Feb 2025 20:39:01 +1300 Subject: [PATCH 06/19] install amdgpu dependencies --- e2e/validators.go | 4 ++-- vhdbuilder/packer/install-dependencies.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index e7c0952a778..6ac1f40edf6 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -334,7 +334,7 @@ func waitUntilResourceAvailable(ctx context.Context, s *Scenario, resourceName s nodeName := s.Runtime.KubeNodeName ticker := time.NewTicker(time.Second) defer ticker.Stop() - + s.T.Logf("waiting for resource %q to be available on node %q", resourceName, nodeName) for { select { case <-ctx.Done(): @@ -438,4 +438,4 @@ func ValidateAMDGPU(ctx context.Context, s *Scenario) { ensurePod(ctx, s, podEnableAMDGPUResource(s)) waitUntilResourceAvailable(ctx, s, "amd.com/gpu") //ensureJob(ctx, s, jobAMDGPUWorkload(s)) -} \ No newline at end of file +} diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 79e8bf355ee..3806ff2428f 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -648,7 +648,7 @@ download_amdgpu_drivers() { # Download to /var/cache/apt/archives/ sudo mkdir -p /var/cache/amdgpu-apt/ sudo chmod 777 /var/cache/amdgpu-apt/ - sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only -y amdgpu-dkms + sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only -y amdgpu-dkms autoconf automake autotools-dev m4 } download_amdgpu_drivers \ No newline at end of file From da9ce36310436c130da85073da1f7128f5c5e1f8 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Tue, 18 Feb 2025 21:39:43 +1300 Subject: [PATCH 07/19] install amdgpu dependencies --- vhdbuilder/packer/install-dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 3806ff2428f..7a262541fb6 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -648,7 +648,7 @@ download_amdgpu_drivers() { # Download to /var/cache/apt/archives/ sudo mkdir -p /var/cache/amdgpu-apt/ sudo chmod 777 /var/cache/amdgpu-apt/ - sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only -y amdgpu-dkms autoconf automake autotools-dev m4 + sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only -y m4 amdgpu-dkms autoconf automake autotools-dev } download_amdgpu_drivers \ No newline at end of file From 8e3efd2ba0f7faa87ee46a024c8654ca6336b015 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 10:13:04 +1300 Subject: [PATCH 08/19] log ssh instructions earlier --- e2e/exec.go | 19 +++++++++++++------ 
e2e/scenario_helpers_test.go | 5 ++++- e2e/validation.go | 5 ----- e2e/validators.go | 2 +- e2e/vmss.go | 17 ++++------------- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/e2e/exec.go b/e2e/exec.go index 6b2ad8985c2..440eb746d6d 100644 --- a/e2e/exec.go +++ b/e2e/exec.go @@ -8,6 +8,7 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/google/uuid" + "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/remotecommand" @@ -54,7 +55,7 @@ type Script struct { interpreter Interpreter } -func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey string, script Script) (*podExecResult, error) { +func execScriptOnVm(ctx context.Context, s *Scenario, script Script) (*podExecResult, error) { /* This works in a way that doesn't rely on the node having joined the cluster: * We create a linux pod on a different node. @@ -77,13 +78,13 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam } steps := []string{ - fmt.Sprintf("echo '%[1]s' > %[2]s", sshPrivateKey, sshKeyName(vmPrivateIP)), + fmt.Sprintf("echo '%[1]s' > %[2]s", s.Runtime.SSHKeyPrivate, sshKeyName(s.Runtime.VMPrivateIP)), "set -x", fmt.Sprintf("echo %[1]s > %[2]s", quoteForBash(script.script), scriptFileName), - fmt.Sprintf("chmod 0600 %s", sshKeyName(vmPrivateIP)), + fmt.Sprintf("chmod 0600 %s", sshKeyName(s.Runtime.VMPrivateIP)), fmt.Sprintf("chmod 0755 %s", scriptFileName), - fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(vmPrivateIP), vmPrivateIP, scriptFileName, remoteScriptFileName), - fmt.Sprintf("%s %s %s", sshString(vmPrivateIP), interpreter, remoteScriptFileName), + fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(s.Runtime.VMPrivateIP), s.Runtime.VMPrivateIP, scriptFileName, remoteScriptFileName), + fmt.Sprintf("%s %s %s", sshString(s.Runtime.VMPrivateIP), interpreter, remoteScriptFileName), } joinedSteps := strings.Join(steps, " && ") @@ -91,7 +92,7 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam s.T.Logf("Executing script %[1]s using %[2]s:\n---START-SCRIPT---\n%[3]s\n---END-SCRIPT---\n", scriptFileName, interpreter, script.script) kube := s.Runtime.Cluster.Kube - execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, jumpboxPodName, joinedSteps) + execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, joinedSteps) if err != nil { return nil, fmt.Errorf("error executing command on pod: %w", err) } @@ -172,6 +173,12 @@ func unprivilegedCommandArray() []string { } } +func uploadSSHKey(ctx context.Context, s *Scenario) { + // hack, ssh key is uploaded as a side-effect of executing a script + _, err := execScriptOnVm(ctx, s, Script{}) + require.NoError(s.T, err) +} + func logSSHInstructions(s *Scenario) { result := "SSH Instructions:" if !config.Config.KeepVMSS { diff --git a/e2e/scenario_helpers_test.go b/e2e/scenario_helpers_test.go index bf777a53036..2595583f0b9 100644 --- a/e2e/scenario_helpers_test.go +++ b/e2e/scenario_helpers_test.go @@ -98,8 +98,9 @@ func RunScenario(t *testing.T, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutVMSS) defer cancel() prepareAKSNode(ctx, s) - 
t.Logf("Choosing the private ACR %q for the vm validation", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) + logSSHInstructions(s) + validateVM(ctx, s) } @@ -150,6 +151,8 @@ func prepareAKSNode(ctx context.Context, s *Scenario) { s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s) require.NoError(s.T, err, "failed to get VM private IP address") + + uploadSSHKey(ctx, s) } func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { diff --git a/e2e/validation.go b/e2e/validation.go index 25d595845be..3b3287f9847 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -38,11 +38,6 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { stdout := execResult.stdout.String() require.NotContains(s.T, stdout, "--dynamic-config-dir", "kubelet flag '--dynamic-config-dir' should not be present in /etc/default/kubelet\nContents:\n%s") - // the instructions belows expects the SSH key to be uploaded to the user pool VM. - // which happens as a side-effect of execCommandOnVMForScenario, it's ugly but works. - // maybe we should use a single ssh key per cluster, but need to be careful with parallel test runs. - logSSHInstructions(s) - ValidateSysctlConfig(ctx, s, map[string]string{ "net.ipv4.tcp_retries2": "8", "net.core.message_burst": "80", diff --git a/e2e/validators.go b/e2e/validators.go index 6ac1f40edf6..478124336f3 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -174,7 +174,7 @@ func execScriptOnVMForScenario(ctx context.Context, s *Scenario, cmd string) *po script.interpreter = Bash } - result, err := execScriptOnVm(ctx, s, s.Runtime.VMPrivateIP, s.Runtime.Cluster.DebugPod.Name, string(s.Runtime.SSHKeyPrivate), script) + result, err := execScriptOnVm(ctx, s, script) require.NoError(s.T, err, "failed to execute command on VM") return result } diff --git a/e2e/vmss.go b/e2e/vmss.go index 6443ef3a928..c1e1f70770a 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -111,9 +111,6 @@ func extractLogsFromVM(ctx context.Context, s *Scenario) { } func extractLogsFromVMLinux(ctx context.Context, s *Scenario) { - privateIP, err := getVMPrivateIPAddress(ctx, s) - require.NoError(s.T, err) - commandList := map[string]string{ "cluster-provision.log": "sudo cat /var/log/azure/cluster-provision.log", "kubelet.log": "sudo journalctl -u kubelet", @@ -122,30 +119,25 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario) { "aks-node-controller.log": "sudo cat /var/log/azure/aks-node-controller.log", } - pod, err := s.Runtime.Cluster.Kube.GetHostNetworkDebugPod(ctx, s.T) - if err != nil { - require.NoError(s.T, err) - } - var logFiles = map[string]string{} for file, sourceCmd := range commandList { - execResult, err := execBashCommandOnVM(ctx, s, privateIP, pod.Name, string(s.Runtime.SSHKeyPrivate), sourceCmd) + execResult, err := execBashCommandOnVM(ctx, s, sourceCmd) if err != nil { s.T.Logf("error executing %s: %s", sourceCmd, err) continue } logFiles[file] = execResult.String() } - err = dumpFileMapToDir(s.T, logFiles) + err := dumpFileMapToDir(s.T, logFiles) require.NoError(s.T, err) } -func execBashCommandOnVM(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey, command string) (*podExecResult, error) { +func execBashCommandOnVM(ctx context.Context, s *Scenario, command string) (*podExecResult, error) { script := Script{ interpreter: Bash, script: command, } - return execScriptOnVm(ctx, s, vmPrivateIP, jumpboxPodName, sshPrivateKey, script) + return execScriptOnVm(ctx, s, script) } const uploadLogsPowershellScript = ` @@ -240,7 +232,6 @@ 
func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { return } s.T.Logf("run command executed successfully: %v", runCommandResp) - s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { From 09c051a42680b320e021c4c95117821b827e8b43 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 10:13:24 +1300 Subject: [PATCH 09/19] update installation step --- parts/linux/cloud-init/artifacts/cse_config.sh | 1 + vhdbuilder/packer/install-dependencies.sh | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index f47342e2e16..a8703bf09d7 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -849,6 +849,7 @@ ensureAMDGPUDrivers() { echo "Installing AMD GPU drivers" pushd /var/cache/amdgpu-apt + ls -l sudo dpkg -i *.deb popd diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 7a262541fb6..1a73af51bc6 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -648,7 +648,11 @@ download_amdgpu_drivers() { # Download to /var/cache/apt/archives/ sudo mkdir -p /var/cache/amdgpu-apt/ sudo chmod 777 /var/cache/amdgpu-apt/ - sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only -y m4 amdgpu-dkms autoconf automake autotools-dev + # Download all dependencies of the amdgpu-dkms package + # The --reinstall flag is used to ensure that the package is downloaded even if it is already installed + # Otherwise installation of some packages like "m4" is skipped because it is already installed + # "m4" seems to be deleted at the later stage, making the installation fail + sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only --reinstall -y m4 amdgpu-dkms autoconf automake autotools-dev amdgpu-dkms-firmware } download_amdgpu_drivers \ No newline at end of file From 75a2722e22aa1d1d2c99028fb30cd1f8f82a0d38 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 13:36:12 +1300 Subject: [PATCH 10/19] fix script --- parts/linux/cloud-init/artifacts/cse_config.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index a8703bf09d7..55aebe30b0a 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -848,6 +848,9 @@ ensureGPUDrivers() { ensureAMDGPUDrivers() { echo "Installing AMD GPU drivers" + # delete amdgpu module from blacklist + sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf + pushd /var/cache/amdgpu-apt ls -l sudo dpkg -i *.deb From 511feb3b983afbb93323b6b1c7c3a8ecd70c2589 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 13:36:35 +1300 Subject: [PATCH 11/19] simplify ssh key upload --- e2e/exec.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/e2e/exec.go b/e2e/exec.go index 440eb746d6d..dcd03a6032b 100644 --- a/e2e/exec.go +++ b/e2e/exec.go @@ -78,10 +78,8 @@ func execScriptOnVm(ctx context.Context, s *Scenario, script Script) (*podExecRe } steps := []string{ - fmt.Sprintf("echo '%[1]s' > %[2]s", s.Runtime.SSHKeyPrivate, sshKeyName(s.Runtime.VMPrivateIP)), "set -x", fmt.Sprintf("echo %[1]s > %[2]s", quoteForBash(script.script), scriptFileName), - fmt.Sprintf("chmod 0600 %s", sshKeyName(s.Runtime.VMPrivateIP)), 
fmt.Sprintf("chmod 0755 %s", scriptFileName), fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(s.Runtime.VMPrivateIP), s.Runtime.VMPrivateIP, scriptFileName, remoteScriptFileName), fmt.Sprintf("%s %s %s", sshString(s.Runtime.VMPrivateIP), interpreter, remoteScriptFileName), @@ -174,9 +172,10 @@ func unprivilegedCommandArray() []string { } func uploadSSHKey(ctx context.Context, s *Scenario) { - // hack, ssh key is uploaded as a side-effect of executing a script - _, err := execScriptOnVm(ctx, s, Script{}) - require.NoError(s.T, err) + cmd := fmt.Sprintf("echo '%[1]s' > %[2]s && chmod 0600 %[2]s", s.Runtime.SSHKeyPrivate, sshKeyName(s.Runtime.VMPrivateIP)) + kube := s.Runtime.Cluster.Kube + _, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, cmd) + require.NoError(s.T, err, "error uploading ssh key to pod") } func logSSHInstructions(s *Scenario) { From 2a1b1d19b6be70508209cb4346e2422747533279 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 13:36:45 +1300 Subject: [PATCH 12/19] reduce noise --- e2e/kube.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/e2e/kube.go b/e2e/kube.go index d19bc99ff28..3c21fd1eedd 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -162,21 +162,16 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t *testing.T, vmssN // found the right node. Use it! node = castNode - nodeTaints, _ := json.Marshal(node.Spec.Taints) - nodeConditions, _ := json.Marshal(node.Status.Conditions) if len(node.Spec.Taints) > 0 { - t.Logf("node %s is tainted. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) continue } for _, cond := range node.Status.Conditions { if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { - t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) + t.Logf("node %s is ready", node.Name) return node.Name } } - - t.Logf("node %s is not ready. 
Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) } if node == nil { From c749b3edaacf50a0e3c0cf9c1e9dd969e957436f Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 14:20:03 +1300 Subject: [PATCH 13/19] use default VMSKU --- e2e/aks_model.go | 2 +- e2e/node_config.go | 4 ++-- e2e/vmss.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e/aks_model.go b/e2e/aks_model.go index 0058b2d4c8a..b65f3719bf7 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -46,7 +46,7 @@ func getBaseClusterModel(clusterName string) *armcontainerservice.ManagedCluster { Name: to.Ptr("nodepool1"), Count: to.Ptr[int32](1), - VMSize: to.Ptr("standard_d2ds_v5"), + VMSize: to.Ptr(config.Config.DefaultVMSKU), MaxPods: to.Ptr[int32](110), OSType: to.Ptr(armcontainerservice.OSTypeLinux), Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), diff --git a/e2e/node_config.go b/e2e/node_config.go index 32b95655f46..01b12d9b7d7 100644 --- a/e2e/node_config.go +++ b/e2e/node_config.go @@ -60,7 +60,7 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod Version: "v0", DisableCustomData: false, LinuxAdminUsername: "azureuser", - VmSize: "Standard_D2ds_v5", + VmSize: config.Config.DefaultVMSKU, ClusterConfig: &aksnodeconfigv1.ClusterConfig{ Location: nbc.ContainerService.Location, ResourceGroup: nbc.ResourceGroupName, @@ -347,7 +347,7 @@ func baseTemplateLinux(t *testing.T, location string, k8sVersion string, arch st }, AgentPoolProfile: &datamodel.AgentPoolProfile{ Name: "nodepool2", - VMSize: "Standard_D2ds_v5", + VMSize: config.Config.DefaultVMSKU, KubeletDiskType: "", WorkloadRuntime: "", DNSPrefix: "", diff --git a/e2e/vmss.go b/e2e/vmss.go index c1e1f70770a..35735c3404e 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -417,7 +417,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual model := armcompute.VirtualMachineScaleSet{ Location: to.Ptr(config.Config.Location), SKU: &armcompute.SKU{ - Name: to.Ptr("Standard_D2ds_v5"), + Name: to.Ptr(config.Config.DefaultVMSKU), Capacity: to.Ptr[int64](1), }, Properties: &armcompute.VirtualMachineScaleSetProperties{ From adfed043e223d3f09fa7f1631fbc9dee549a4d4b Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 14:20:28 +1300 Subject: [PATCH 14/19] improve logging --- e2e/cluster.go | 3 +++ e2e/config/vhd.go | 2 ++ e2e/scenario_helpers_test.go | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/e2e/cluster.go b/e2e/cluster.go index 2a88b285a5d..36ea8b07525 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -107,6 +107,7 @@ func ClusterAzureNetwork(ctx context.Context, t *testing.T) (*Cluster, error) { } func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerservice.ManagedCluster, isAirgap, isNonAnonymousPull bool) (*Cluster, error) { + t.Logf("preparing cluster %q", *cluster.Name) ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster) defer cancel() cluster.Name = to.Ptr(fmt.Sprintf("%s-%s", *cluster.Name, hash(cluster))) @@ -173,6 +174,8 @@ func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerserv return nil, fmt.Errorf("get host network debug pod: %w", err) } + t.Logf("cluster %q is ready", *cluster.Name) + return &Cluster{ Model: cluster, Kube: kube, diff --git a/e2e/config/vhd.go b/e2e/config/vhd.go index 52ab5f0eea4..58e88d259e4 100644 --- a/e2e/config/vhd.go +++ b/e2e/config/vhd.go @@ -208,6 +208,7 @@ func (i *Image) String() string { func (i 
*Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID, error) { i.vhdOnce.Do(func() { + t.Logf("finding the latest image version for %s, %s", i.Name, i.Version) switch { case i.Latest: i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, "", "") @@ -220,6 +221,7 @@ func (i *Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID, i.vhdErr = fmt.Errorf("img: %s, tag %s=%s, err %w", i.Name, Config.SIGVersionTagName, Config.SIGVersionTagValue, i.vhdErr) t.Logf("failed to find the latest image version for %s", i.vhdErr) } + t.Logf("found the latest image version for %s, %s", i.Name, i.vhd) }) return i.vhd, i.vhdErr } diff --git a/e2e/scenario_helpers_test.go b/e2e/scenario_helpers_test.go index 2595583f0b9..413d8b0de18 100644 --- a/e2e/scenario_helpers_test.go +++ b/e2e/scenario_helpers_test.go @@ -180,7 +180,7 @@ func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { } } - vhd, err := s.VHD.VHDResourceID(ctx, t) + _, err := s.VHD.VHDResourceID(ctx, t) if err != nil { if config.Config.IgnoreScenariosWithMissingVHD && errors.Is(err, config.ErrNotFound) { t.Skipf("skipping scenario %q: could not find image", t.Name()) @@ -188,7 +188,6 @@ func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { t.Fatalf("could not find image for %q: %s", t.Name(), err) } } - t.Logf("VHD: %q, TAGS %+v", vhd, s.Tags) } func validateVM(ctx context.Context, s *Scenario) { From af1b03ff57d3cedbd6d5294e863d66728c8b1b15 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 19 Feb 2025 14:21:42 +1300 Subject: [PATCH 15/19] update e2e tests --- e2e/scenario_test.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 47c91357fbb..933df5ae874 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -1665,8 +1665,10 @@ func Test_Ubuntu2404ARM(t *testing.T) { }) } -func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { - //t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet") +func Test_Ubuntu2404Gen2Containerd_AMDGPU_MI300(t *testing.T) { + t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet") + //E2E_LOCATION=eastus2euap + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef RunScenario(t, &Scenario{ Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped", Tags: Tags{ @@ -1674,7 +1676,7 @@ func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { }, Config: Config{ Cluster: ClusterKubenet, - VHD: config.VHDUbuntu2204Gen2Containerd, //TODO: add support for older + VHD: config.VHDUbuntu2404Gen2Containerd, //TODO: add support for older BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5" nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2" @@ -1685,7 +1687,8 @@ func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5") - vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom + // rocm images are huge, some space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) }, Validator: func(ctx context.Context, s *Scenario) { ValidateAMDGPU(ctx, s) @@ -1696,8 
+1699,9 @@ func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) { // the SKU isn't available in subscription/region we run tests - //t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") - // LOCATION=southcentralus + t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") + //E2E_LOCATION=southcentralus + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef RunScenario(t, &Scenario{ Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped", Tags: Tags{ @@ -1717,7 +1721,8 @@ func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5") - vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom + // rocm images are huge, some space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) }, Validator: func(ctx context.Context, s *Scenario) { ValidateAMDGPU(ctx, s) }, From 82081dfb855e0f5ee7e1ad3ea0078ffb50a22c7a Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 24 Feb 2025 10:34:46 +1300 Subject: [PATCH 16/19] update test helpers --- e2e/cluster.go | 1 - e2e/config/azure.go | 4 +++- e2e/config/vhd.go | 4 ++-- e2e/scenario_helpers_test.go | 5 ----- e2e/vmss.go | 9 +++++++++ 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/e2e/cluster.go b/e2e/cluster.go index 36ea8b07525..ccd00fa46d6 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -107,7 +107,6 @@ func ClusterAzureNetwork(ctx context.Context, t *testing.T) (*Cluster, error) { } func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerservice.ManagedCluster, isAirgap, isNonAnonymousPull bool) (*Cluster, error) { - t.Logf("preparing cluster %q", *cluster.Name) ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster) defer cancel() cluster.Name = to.Ptr(fmt.Sprintf("%s-%s", *cluster.Name, hash(cluster))) diff --git a/e2e/config/azure.go b/e2e/config/azure.go index 0884e39903a..2c7a0f948f8 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -367,7 +367,7 @@ func (a *AzureClient) assignRolesToVMIdentity(ctx context.Context, principalID * return nil } -func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Image, tagName, tagValue string) (VHDResourceID, error) { +func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, t *testing.T, image *Image, tagName, tagValue string) (VHDResourceID, error) { galleryImageVersion, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) if err != nil { return "", fmt.Errorf("create a new images client: %v", err) } @@ -409,6 +409,8 @@ func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Ima return "", fmt.Errorf("ensuring image replication: %w", err) } + t.Logf("found the latest image version for %s, %s", image.Name, *latestVersion.Name) + return VHDResourceID(*latestVersion.ID), nil } diff --git a/e2e/config/vhd.go b/e2e/config/vhd.go index 58e88d259e4..b7e2a4c632a 100644 --- a/e2e/config/vhd.go +++ b/e2e/config/vhd.go @@ -211,11 +211,11 @@ func (i *Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID, t.Logf("finding the latest image version for %s, %s", i.Name, i.Version) switch { case i.Latest: - i.vhd, i.vhdErr =
Azure.LatestSIGImageVersionByTag(ctx, i, "", "") + i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, t, i, "", "") case i.Version != "": i.vhd, i.vhdErr = Azure.EnsureSIGImageVersion(ctx, i) default: - i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, Config.SIGVersionTagName, Config.SIGVersionTagValue) + i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, t, i, Config.SIGVersionTagName, Config.SIGVersionTagValue) } if i.vhdErr != nil { i.vhdErr = fmt.Errorf("img: %s, tag %s=%s, err %w", i.Name, Config.SIGVersionTagName, Config.SIGVersionTagValue, i.vhdErr) diff --git a/e2e/scenario_helpers_test.go b/e2e/scenario_helpers_test.go index 413d8b0de18..85fa042fd91 100644 --- a/e2e/scenario_helpers_test.go +++ b/e2e/scenario_helpers_test.go @@ -99,7 +99,6 @@ func RunScenario(t *testing.T, s *Scenario) { defer cancel() prepareAKSNode(ctx, s) t.Logf("Choosing the private ACR %q for the vm validation", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) - logSSHInstructions(s) validateVM(ctx, s) } @@ -149,10 +148,6 @@ func prepareAKSNode(ctx context.Context, s *Scenario) { s.Runtime.KubeNodeName = s.Runtime.Cluster.Kube.WaitUntilNodeReady(ctx, s.T, s.Runtime.VMSSName) s.T.Logf("node %s is ready", s.Runtime.VMSSName) - s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s) - require.NoError(s.T, err, "failed to get VM private IP address") - - uploadSSHKey(ctx, s) } func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { diff --git a/e2e/vmss.go b/e2e/vmss.go index 35735c3404e..e406982ad11 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -35,6 +35,15 @@ const ( ) func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScaleSet { + defer func() { + var err error + s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s) + require.NoError(s.T, err, "failed to get VM private IP address") + + uploadSSHKey(ctx, s) + logSSHInstructions(s) + }() + cluster := s.Runtime.Cluster var nodeBootstrapping *datamodel.NodeBootstrapping ab, err := agent.NewAgentBaker() From 295f2d69fb9feac049c64f22c4d8ad5558c7c938 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 24 Feb 2025 10:36:53 +1300 Subject: [PATCH 17/19] clean installation scripts --- .../linux/cloud-init/artifacts/cse_config.sh | 28 ++++++++++++++++++- parts/linux/cloud-init/artifacts/cse_main.sh | 7 +---- vhdbuilder/packer/install-dependencies.sh | 16 +++++++++-- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 55aebe30b0a..a47890a98e6 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -844,8 +844,17 @@ ensureGPUDrivers() { fi } -# TODO: this is a temporary ubuntu-only HACK until we get a driver ensureAMDGPUDrivers() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + ensureAMDGPUDriversUbuntu + else + echo "os $OS not supported at this time. 
skipping ensureAMDGPUDrivers" + return + fi +} + +# TODO: this is a temporary ubuntu-only HACK until we get a driver +ensureAMDGPUDriversUbuntu() { echo "Installing AMD GPU drivers" # delete amdgpu module from blacklist @@ -860,6 +869,23 @@ ensureAMDGPUDrivers() { echo "AMD GPU drivers installed" } +cleanAMDGPUDriver() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + ensureAMDGPUDriversUbuntu + else + return + fi +} + +cleanAMDGPUDriverUbuntu() { + # delete amd from a list of recognized vendors + sudo rm /etc/apt/keyrings/rocm.gpg + sudo rm /etc/apt/sources.list.d/amdgpu.list + # delete cached amd gpu packages to save disk space + sudo rm -rf /var/cache/amdgpu-apt/* +} + + disableSSH() { systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH } diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index e896b12a3ca..ec6d288c0b0 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -208,13 +208,8 @@ fi if [[ "${AMD_GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then logs_to_events "AKS.CSE.ensureAMDGPUDrivers" ensureAMDGPUDrivers -else - # delete cached amd gpu packages to save disk space - sudo rm /etc/apt/keyrings/rocm.gpg - sudo rm /etc/apt/sources.list.d/amdgpu.list - sudo rm -rf /var/cache/amdgpu-apt/* fi - +cleanAMDGPUDrivers echo $(date),$(hostname), "End configuring GPU drivers" diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 1a73af51bc6..7dd413b198a 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -621,7 +621,7 @@ capture_benchmark "${SCRIPT_NAME}_overall" true process_benchmarks -download_amdgpu_drivers() { +downloadAMDGPUDriversUbuntu() { if [[ $OS != $UBUNTU_OS_NAME ]]; then echo "Skipping AMD GPU driver setup: Unsupported OS (${OS})" return @@ -641,7 +641,8 @@ download_amdgpu_drivers() { wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \ gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null sudo chmod 0644 /etc/apt/keyrings/rocm.gpg - echo "deb [arch=amd64,i386 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.3.2/ubuntu ${DISTRO} main" \ + + echo "deb [arch=amd64,i386 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.3.3/ubuntu ${DISTRO} main" \ | sudo tee /etc/apt/sources.list.d/amdgpu.list sudo apt-get update @@ -655,4 +656,13 @@ download_amdgpu_drivers() { sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only --reinstall -y m4 amdgpu-dkms autoconf automake autotools-dev amdgpu-dkms-firmware } -download_amdgpu_drivers \ No newline at end of file +downloadAMDGPUDrivers() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + downloadAMDGPUDriversUbuntu + else + echo "os $OS not supported at this time. 
skipping ensureAMDGPUDrivers" + return + fi +} + +downloadAMDGPUDrivers \ No newline at end of file From c217da74a5621937e7351f646e437bf6234835c5 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 24 Feb 2025 11:39:44 +1300 Subject: [PATCH 18/19] delete amd keys after downloading packages --- parts/linux/cloud-init/artifacts/cse_config.sh | 9 +++------ vhdbuilder/packer/install-dependencies.sh | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index a47890a98e6..eec6ad18086 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -857,14 +857,14 @@ ensureAMDGPUDrivers() { ensureAMDGPUDriversUbuntu() { echo "Installing AMD GPU drivers" - # delete amdgpu module from blacklist - sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf - pushd /var/cache/amdgpu-apt ls -l sudo dpkg -i *.deb popd + # delete amdgpu module from blacklist + sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf + REBOOTREQUIRED=true echo "AMD GPU drivers installed" } @@ -878,9 +878,6 @@ cleanAMDGPUDriver() { } cleanAMDGPUDriverUbuntu() { - # delete amd from a list of recognized vendors - sudo rm /etc/apt/keyrings/rocm.gpg - sudo rm /etc/apt/sources.list.d/amdgpu.list # delete cached amd gpu packages to save disk space sudo rm -rf /var/cache/amdgpu-apt/* } diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 7dd413b198a..788522dcc9d 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -654,6 +654,9 @@ downloadAMDGPUDriversUbuntu() { # Otherwise installation of some packages like "m4" is skipped because it is already installed # "m4" seems to be deleted at the later stage, making the installation fail sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only --reinstall -y m4 amdgpu-dkms autoconf automake autotools-dev amdgpu-dkms-firmware + # delete amd from a list of recognized vendors + sudo rm /etc/apt/keyrings/rocm.gpg + sudo rm /etc/apt/sources.list.d/amdgpu.list } downloadAMDGPUDrivers() { From 2ac40ceca47df83622a423873f2472a9a48eb24b Mon Sep 17 00:00:00 2001 From: r2k1 Date: Mon, 24 Feb 2025 15:44:07 +1300 Subject: [PATCH 19/19] install amdgpu instead of downloading pkg --- vhdbuilder/packer/install-dependencies.sh | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 788522dcc9d..ba7016fb999 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -642,18 +642,14 @@ downloadAMDGPUDriversUbuntu() { gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null sudo chmod 0644 /etc/apt/keyrings/rocm.gpg - echo "deb [arch=amd64,i386 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.3.3/ubuntu ${DISTRO} main" \ + echo "deb [arch=amd64,i386 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.3.3/ubuntu noble main" \ | sudo tee /etc/apt/sources.list.d/amdgpu.list sudo apt-get update - + sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf # Download to /var/cache/apt/archives/ sudo mkdir -p /var/cache/amdgpu-apt/ sudo chmod 777 /var/cache/amdgpu-apt/ - # Download all dependencies of the amdgpu-dkms package - # The --reinstall flag is used 
to ensure that the package is downloaded even if it is already installed - # Otherwise installation of some packages like "m4" is skipped because it is already installed - # "m4" seems to be deleted at the later stage, making the installation fail - sudo apt-get install -o Dir::Cache::Archives="/var/cache/amdgpu-apt" --download-only --reinstall -y m4 amdgpu-dkms autoconf automake autotools-dev amdgpu-dkms-firmware + time sudo apt-get install -y amdgpu-dkms # delete amd from a list of recognized vendors sudo rm /etc/apt/keyrings/rocm.gpg sudo rm /etc/apt/sources.list.d/amdgpu.list