
Install AMD GPU Kernel drivers if required #5875

Open · wants to merge 15 commits into base: master
1 change: 1 addition & 0 deletions aks-node-controller/parser/parser.go
@@ -83,6 +83,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
"API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(),
"IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)),
"GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)),
"AMD_GPU_NODE": fmt.Sprintf("%v", config.GetGpuConfig().GetEnableAmdGpu()),
"SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())),
"MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())),
"CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()),
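For reference, a minimal unit-test sketch for the new variable (hypothetical, not part of this PR; it lives in the same parser package, assumes the generated aksnodeconfigv1 struct fields match the getters used above, and assumes getCSEEnv tolerates an otherwise empty Configuration):

func TestGetCSEEnvAMDGPUNode(t *testing.T) {
	cfg := &aksnodeconfigv1.Configuration{
		GpuConfig: &aksnodeconfigv1.GpuConfig{
			EnableAmdGpu: true, // assumed field name behind GetEnableAmdGpu()
		},
	}
	env := getCSEEnv(cfg)
	if env["AMD_GPU_NODE"] != "true" {
		t.Errorf("expected AMD_GPU_NODE=true, got %q", env["AMD_GPU_NODE"])
	}
}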
2 changes: 1 addition & 1 deletion e2e/aks_model.go
@@ -46,7 +46,7 @@ func getBaseClusterModel(clusterName string) *armcontainerservice.ManagedCluster
{
Name: to.Ptr("nodepool1"),
Count: to.Ptr[int32](1),
VMSize: to.Ptr("standard_d2ds_v5"),
VMSize: to.Ptr(config.Config.DefaultVMSKU),
MaxPods: to.Ptr[int32](110),
OSType: to.Ptr(armcontainerservice.OSTypeLinux),
Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
3 changes: 3 additions & 0 deletions e2e/cluster.go
@@ -107,6 +107,7 @@ func ClusterAzureNetwork(ctx context.Context, t *testing.T) (*Cluster, error) {
}

func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerservice.ManagedCluster, isAirgap, isNonAnonymousPull bool) (*Cluster, error) {
t.Logf("preparing cluster %q", *cluster.Name)
ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster)
defer cancel()
cluster.Name = to.Ptr(fmt.Sprintf("%s-%s", *cluster.Name, hash(cluster)))
@@ -173,6 +174,8 @@ func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerserv
return nil, fmt.Errorf("get host network debug pod: %w", err)
}

t.Logf("cluster %q is ready", *cluster.Name)

return &Cluster{
Model: cluster,
Kube: kube,
40 changes: 21 additions & 19 deletions e2e/config/azure.go
@@ -291,25 +291,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin
}

func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) {
identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
Location: to.Ptr(Config.Location),
}, nil)
if err != nil {
return "", fmt.Errorf("create managed identity: %w", err)
}
err = a.createBlobStorageAccount(ctx)
if err != nil {
return "", err
}
err = a.createBlobStorageContainer(ctx)
if err != nil {
return "", err
}

if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
return "", err
}
return *identity.Properties.ClientID, nil
// HACK: temporary disable to allow running test in different subscription, without enough permissions
Review comment (Contributor): should we uncomment all this?
Reply (Author): I'll remove it before merging the PR, once I stop testing it. Currently it's required only for a single scriptless test that is disabled by default, so there's not much harm if it accidentally leaks through.

return "", nil
// identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
// Location: to.Ptr(Config.Location),
// }, nil)
// if err != nil {
// return "", fmt.Errorf("create managed identity: %w", err)
// }
// err = a.createBlobStorageAccount(ctx)
// if err != nil {
// return "", err
// }
// err = a.createBlobStorageContainer(ctx)
// if err != nil {
// return "", err
// }

// if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
// return "", err
// }
// return *identity.Properties.ClientID, nil
}

func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error {
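On the review thread above: rather than commenting the body out, the same short-circuit could be kept behind a flag so the original path stays compiled. A sketch only, assuming a hypothetical Config.SkipVMIdentity field in the e2e config; the body below is the pre-existing implementation from this file:

func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) {
	if Config.SkipVMIdentity { // hypothetical flag, not part of this PR
		return "", nil
	}
	identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
		Location: to.Ptr(Config.Location),
	}, nil)
	if err != nil {
		return "", fmt.Errorf("create managed identity: %w", err)
	}
	if err := a.createBlobStorageAccount(ctx); err != nil {
		return "", err
	}
	if err := a.createBlobStorageContainer(ctx); err != nil {
		return "", err
	}
	if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
		return "", err
	}
	return *identity.Properties.ClientID, nil
}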
2 changes: 2 additions & 0 deletions e2e/config/vhd.go
@@ -208,6 +208,7 @@ func (i *Image) String() string {

func (i *Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID, error) {
i.vhdOnce.Do(func() {
t.Logf("finding the latest image version for %s, %s", i.Name, i.Version)
switch {
case i.Latest:
i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, "", "")
@@ -220,6 +221,7 @@ func (i *Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID,
i.vhdErr = fmt.Errorf("img: %s, tag %s=%s, err %w", i.Name, Config.SIGVersionTagName, Config.SIGVersionTagValue, i.vhdErr)
t.Logf("failed to find the latest image version for %s", i.vhdErr)
}
t.Logf("found the latest image version for %s, %s", i.Name, i.vhd)
})
return i.vhd, i.vhdErr
}
18 changes: 12 additions & 6 deletions e2e/exec.go
@@ -8,6 +8,7 @@ import (

"github.com/Azure/agentbaker/e2e/config"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/remotecommand"
@@ -54,7 +55,7 @@ type Script struct {
interpreter Interpreter
}

func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey string, script Script) (*podExecResult, error) {
func execScriptOnVm(ctx context.Context, s *Scenario, script Script) (*podExecResult, error) {
/*
This works in a way that doesn't rely on the node having joined the cluster:
* We create a linux pod on a different node.
@@ -77,21 +78,19 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam
}

steps := []string{
fmt.Sprintf("echo '%[1]s' > %[2]s", sshPrivateKey, sshKeyName(vmPrivateIP)),
"set -x",
fmt.Sprintf("echo %[1]s > %[2]s", quoteForBash(script.script), scriptFileName),
fmt.Sprintf("chmod 0600 %s", sshKeyName(vmPrivateIP)),
fmt.Sprintf("chmod 0755 %s", scriptFileName),
fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(vmPrivateIP), vmPrivateIP, scriptFileName, remoteScriptFileName),
fmt.Sprintf("%s %s %s", sshString(vmPrivateIP), interpreter, remoteScriptFileName),
fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(s.Runtime.VMPrivateIP), s.Runtime.VMPrivateIP, scriptFileName, remoteScriptFileName),
fmt.Sprintf("%s %s %s", sshString(s.Runtime.VMPrivateIP), interpreter, remoteScriptFileName),
}

joinedSteps := strings.Join(steps, " && ")

s.T.Logf("Executing script %[1]s using %[2]s:\n---START-SCRIPT---\n%[3]s\n---END-SCRIPT---\n", scriptFileName, interpreter, script.script)

kube := s.Runtime.Cluster.Kube
execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, jumpboxPodName, joinedSteps)
execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, joinedSteps)
if err != nil {
return nil, fmt.Errorf("error executing command on pod: %w", err)
}
@@ -172,6 +171,13 @@ func unprivilegedCommandArray() []string {
}
}

func uploadSSHKey(ctx context.Context, s *Scenario) {
cmd := fmt.Sprintf("echo '%[1]s' > %[2]s && chmod 0600 %[2]s", s.Runtime.SSHKeyPrivate, sshKeyName(s.Runtime.VMPrivateIP))
kube := s.Runtime.Cluster.Kube
_, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, cmd)
require.NoError(s.T, err, "error uploading ssh key to pod")
}

func logSSHInstructions(s *Scenario) {
result := "SSH Instructions:"
if !config.Config.KeepVMSS {
56 changes: 50 additions & 6 deletions e2e/kube.go
@@ -162,21 +162,16 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t *testing.T, vmssN

// found the right node. Use it!
node = castNode
nodeTaints, _ := json.Marshal(node.Spec.Taints)
nodeConditions, _ := json.Marshal(node.Status.Conditions)
if len(node.Spec.Taints) > 0 {
t.Logf("node %s is tainted. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
continue
}

for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
t.Logf("node %s is ready", node.Name)
return node.Name
}
}

t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
}

if node == nil {
@@ -637,3 +632,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet {
},
}
}

func podEnableAMDGPUResource(s *Scenario) *corev1.Pod {
return &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName),
Namespace: defaultNamespace,
},
Spec: corev1.PodSpec{
PriorityClassName: "system-node-critical",
NodeSelector: map[string]string{
"kubernetes.io/hostname": s.Runtime.KubeNodeName,
},
Containers: []corev1.Container{
{
Name: "amdgpu-device-plugin-container",
Image: "rocm/k8s-device-plugin",
Review comment (Contributor): should pull from MCR, dockerhub pull isn't stable (assuming this is from docker.io).
Reply (Author): I'll need to add it to MCR at some stage. I don't think it's available there yet.
Reply (Contributor): there are SFIs to look for that; we lost the contract preventing us from getting throttled on docker.io, so expect throttling. We have an ACR for E2E; we should set up a remote/cache for this image and use our own ACR.

VolumeMounts: []corev1.VolumeMount{
{
Name: "device-plugin",
MountPath: "/var/lib/kubelet/device-plugins",
},
{
Name: "sys",
MountPath: "/sys",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "device-plugin",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/lib/kubelet/device-plugins",
},
},
},
{
Name: "sys",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/sys",
},
},
},
},
},
}
}
4 changes: 2 additions & 2 deletions e2e/node_config.go
@@ -60,7 +60,7 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod
Version: "v0",
DisableCustomData: false,
LinuxAdminUsername: "azureuser",
VmSize: "Standard_D2ds_v5",
VmSize: config.Config.DefaultVMSKU,
ClusterConfig: &aksnodeconfigv1.ClusterConfig{
Location: nbc.ContainerService.Location,
ResourceGroup: nbc.ResourceGroupName,
@@ -347,7 +347,7 @@ func baseTemplateLinux(t *testing.T, location string, k8sVersion string, arch st
},
AgentPoolProfile: &datamodel.AgentPoolProfile{
Name: "nodepool2",
VMSize: "Standard_D2ds_v5",
VMSize: config.Config.DefaultVMSKU,
KubeletDiskType: "",
WorkloadRuntime: "",
DNSPrefix: "",
8 changes: 5 additions & 3 deletions e2e/scenario_helpers_test.go
@@ -98,8 +98,9 @@ func RunScenario(t *testing.T, s *Scenario) {
ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutVMSS)
defer cancel()
prepareAKSNode(ctx, s)

t.Logf("Choosing the private ACR %q for the vm validation", config.GetPrivateACRName(s.Tags.NonAnonymousACR))
logSSHInstructions(s)

validateVM(ctx, s)
}

@@ -150,6 +151,8 @@ func prepareAKSNode(ctx context.Context, s *Scenario) {

s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s)
require.NoError(s.T, err, "failed to get VM private IP address")

uploadSSHKey(ctx, s)
}

func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) {
@@ -177,15 +180,14 @@ func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) {
}
}

vhd, err := s.VHD.VHDResourceID(ctx, t)
_, err := s.VHD.VHDResourceID(ctx, t)
if err != nil {
if config.Config.IgnoreScenariosWithMissingVHD && errors.Is(err, config.ErrNotFound) {
t.Skipf("skipping scenario %q: could not find image", t.Name())
} else {
t.Fatalf("could not find image for %q: %s", t.Name(), err)
}
}
t.Logf("VHD: %q, TAGS %+v", vhd, s.Tags)
}

func validateVM(ctx context.Context, s *Scenario) {
66 changes: 66 additions & 0 deletions e2e/scenario_test.go
@@ -1664,3 +1664,69 @@ func Test_Ubuntu2404ARM(t *testing.T) {
},
})
}

func Test_Ubuntu2404Gen2Containerd_AMDGPU_MI300(t *testing.T) {
t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet")
Review comment (Contributor): can we add a TODO?

//E2E_LOCATION=eastus2euap
//SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef
RunScenario(t, &Scenario{
Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped",
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2404Gen2Containerd, //TODO: add support for older
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5"
nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5"
nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
nbc.EnableAMDGPU = true
nbc.ConfigGPUDriverIfNeeded = true
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5")
// rocm images are huge, some space for manual testing
vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128)
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateAMDGPU(ctx, s)
},
},
})
}

func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) {
	// the SKU isn't available in the subscription/region we run tests in
Review comment (Contributor): can we add a TODO?

t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet")
//E2E_LOCATION=southcentralus
//SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef
RunScenario(t, &Scenario{
Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped",
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2204Gen2Containerd,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5"
nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5"
nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
nbc.EnableAMDGPU = true
nbc.ConfigGPUDriverIfNeeded = true

},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5")
// rocm images are huge, some space for manual testing
vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128)
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateAMDGPU(ctx, s)
},
},
})
}
5 changes: 0 additions & 5 deletions e2e/validation.go
@@ -38,11 +38,6 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
stdout := execResult.stdout.String()
require.NotContains(s.T, stdout, "--dynamic-config-dir", "kubelet flag '--dynamic-config-dir' should not be present in /etc/default/kubelet\nContents:\n%s")

// the instructions belows expects the SSH key to be uploaded to the user pool VM.
// which happens as a side-effect of execCommandOnVMForScenario, it's ugly but works.
// maybe we should use a single ssh key per cluster, but need to be careful with parallel test runs.
logSSHInstructions(s)

ValidateSysctlConfig(ctx, s, map[string]string{
"net.ipv4.tcp_retries2": "8",
"net.core.message_burst": "80",
16 changes: 14 additions & 2 deletions e2e/validators.go
@@ -174,7 +174,7 @@ func execScriptOnVMForScenario(ctx context.Context, s *Scenario, cmd string) *po
script.interpreter = Bash
}

result, err := execScriptOnVm(ctx, s, s.Runtime.VMPrivateIP, s.Runtime.Cluster.DebugPod.Name, string(s.Runtime.SSHKeyPrivate), script)
result, err := execScriptOnVm(ctx, s, script)
require.NoError(s.T, err, "failed to execute command on VM")
return result
}
@@ -334,7 +334,7 @@ func waitUntilResourceAvailable(ctx context.Context, s *Scenario, resourceName s
nodeName := s.Runtime.KubeNodeName
ticker := time.NewTicker(time.Second)
defer ticker.Stop()

s.T.Logf("waiting for resource %q to be available on node %q", resourceName, nodeName)
for {
select {
case <-ctx.Done():
@@ -427,3 +427,15 @@ func GetFieldFromJsonObjectOnNode(ctx context.Context, s *Scenario, fileName str

return podExecResult.stdout.String()
}

func ValidateAMDGPU(ctx context.Context, s *Scenario) {
s.T.Logf("validating pod using AMD GPU")

execResult := execScriptOnVMForScenario(ctx, s, "lspci -k")
require.Equal(s.T, "0", execResult.exitCode, "expected to find lspci command, but did not")
assert.Contains(s.T, execResult.stdout.String(), "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not")

ensurePod(ctx, s, podEnableAMDGPUResource(s))
waitUntilResourceAvailable(ctx, s, "amd.com/gpu")
//ensureJob(ctx, s, jobAMDGPUWorkload(s))
}
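
The commented-out ensureJob call above points at a follow-up workload check. A sketch of what such a jobAMDGPUWorkload helper could look like (hypothetical, not part of this PR; assumes batchv1 "k8s.io/api/batch/v1" and "k8s.io/apimachinery/pkg/api/resource" imports, and uses a placeholder image and command). It requests one amd.com/gpu, so the job only schedules once the device plugin advertises the resource:

func jobAMDGPUWorkload(s *Scenario) *batchv1.Job {
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-amdgpu-workload", s.Runtime.KubeNodeName),
			Namespace: defaultNamespace,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					NodeSelector: map[string]string{
						"kubernetes.io/hostname": s.Runtime.KubeNodeName,
					},
					Containers: []corev1.Container{
						{
							Name:    "amdgpu-smoke-test",
							Image:   "rocm/rocm-terminal", // placeholder image
							Command: []string{"rocm-smi"},
							Resources: corev1.ResourceRequirements{
								Limits: corev1.ResourceList{
									"amd.com/gpu": resource.MustParse("1"),
								},
							},
						},
					},
				},
			},
		},
	}
}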