test: E2E Tweaks (#5857)
r2k1 authored Feb 17, 2025
1 parent 23c200e commit c44de42
Showing 9 changed files with 86 additions and 26 deletions.
9 changes: 8 additions & 1 deletion e2e/cluster.go
@@ -20,10 +20,10 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/google/uuid"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/clientcmd"
)
@@ -57,6 +57,7 @@ type Cluster struct {
SubnetID string
ClusterParams *ClusterParams
Maintenance *armcontainerservice.MaintenanceConfiguration
DebugPod *corev1.Pod
}

// Returns true if the cluster is configured with Azure CNI
@@ -167,12 +168,18 @@ func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerserv
return nil, fmt.Errorf("extracting cluster parameters: %w", err)
}

hostPod, err := kube.GetHostNetworkDebugPod(ctx, t)
if err != nil {
return nil, fmt.Errorf("get host network debug pod: %w", err)
}

return &Cluster{
Model: cluster,
Kube: kube,
SubnetID: subnetID,
Maintenance: maintenance,
ClusterParams: clusterParams,
DebugPod: hostPod,
}, nil
}

57 changes: 57 additions & 0 deletions e2e/config/azure.go
@@ -9,6 +9,7 @@ import (
"net/http"
"os"
"strings"
"testing"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore"
@@ -491,3 +492,59 @@ func ensureProvisioningState(version *armcompute.GalleryImageVersion) error {
}
return nil
}

func (a *AzureClient) CreateVMSSWithRetry(ctx context.Context, t *testing.T, resourceGroupName string, vmssName string, parameters armcompute.VirtualMachineScaleSet) (*armcompute.VirtualMachineScaleSet, error) {
t.Logf("creating VMSS %s in resource group %s", vmssName, resourceGroupName)
delay := 5 * time.Second
retryOn := func(err error) bool {
var respErr *azcore.ResponseError
// AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability; retrying sometimes helps.
// It's not a quota issue.
return errors.As(err, &respErr) && respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed"
}
attempt := 0
for {
attempt++
vmss, err := a.createVMSS(ctx, resourceGroupName, vmssName, parameters)
if err == nil {
t.Logf("created VMSS %s in resource group %s", vmssName, resourceGroupName)
return vmss, nil
}

// not a retryable error
if !retryOn(err) {
return nil, err
}

if attempt >= 10 {
return nil, fmt.Errorf("failed to create VMSS after 10 retries: %w", err)
}

t.Logf("failed to create VMSS: %v, attempt: %v, retrying in %v", err, attempt, delay)
select {
case <-ctx.Done():
return nil, err
case <-time.After(delay):
}
}

}

func (a *AzureClient) createVMSS(ctx context.Context, resourceGroupName string, vmssName string, parameters armcompute.VirtualMachineScaleSet) (*armcompute.VirtualMachineScaleSet, error) {
operation, err := a.VMSS.BeginCreateOrUpdate(
ctx,
resourceGroupName,
vmssName,
parameters,
nil,
)
if err != nil {
return nil, err
}
vmssResp, err := operation.PollUntilDone(ctx, DefaultPollUntilDoneOptions)
if err != nil {
return nil, err
}
return &vmssResp.VirtualMachineScaleSet, nil

}
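
A minimal usage sketch of the new helper, assuming it runs from a test in the same e2e package where config and armcompute are already imported; the resource group and VMSS names below are placeholders:

func TestCreateVMSSWithRetrySketch(t *testing.T) {
	ctx := context.Background()
	// In the real suite the model is built and customized via the scenario's PrepareVMSSModel hook;
	// an empty model here only illustrates the call shape.
	model := armcompute.VirtualMachineScaleSet{}

	// Retries only on AllocationFailed (up to 10 attempts, 5 seconds apart); any other error is returned immediately.
	vmss, err := config.Azure.CreateVMSSWithRetry(ctx, t, "MC_placeholder_rg", "placeholder-vmss", model)
	if err != nil {
		t.Fatalf("create VMSS: %v", err)
	}
	t.Logf("created VMSS %s", *vmss.Name)
}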
2 changes: 1 addition & 1 deletion e2e/exec.go
@@ -180,7 +180,7 @@ func logSSHInstructions(s *Scenario) {
result += "\n========================\n"
result += fmt.Sprintf("az account set --subscription %s\n", config.Config.SubscriptionID)
result += fmt.Sprintf("az aks get-credentials --resource-group %s --name %s --overwrite-existing\n", config.ResourceGroupName, *s.Runtime.Cluster.Model.Name)
result += fmt.Sprintf(`kubectl exec -it %s -- bash -c "chroot /proc/1/root /bin/bash -c '%s'"`, s.Runtime.DebugHostPod, sshString(s.Runtime.VMPrivateIP))
result += fmt.Sprintf(`kubectl exec -it %s -- bash -c "chroot /proc/1/root /bin/bash -c '%s'"`, s.Runtime.Cluster.DebugPod.Name, sshString(s.Runtime.VMPrivateIP))
s.T.Log(result)
//runtime.Breakpoint() // uncomment to pause the test
}
17 changes: 13 additions & 4 deletions e2e/pod.go
@@ -2,6 +2,8 @@ package e2e

import (
"context"
"strings"
"testing"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
@@ -12,10 +14,7 @@ import (

func ensurePod(ctx context.Context, s *Scenario, pod *corev1.Pod) {
kube := s.Runtime.Cluster.Kube
if len(pod.Name) > 63 {
pod.Name = pod.Name[:63]
s.T.Logf("truncated pod name to %q", pod.Name)
}
truncatePodName(s.T, pod)
s.T.Logf("creating pod %q", pod.Name)
_, err := kube.Typed.CoreV1().Pods(pod.Namespace).Create(ctx, pod, metav1.CreateOptions{})
require.NoErrorf(s.T, err, "failed to create pod %q", pod.Name)
@@ -32,3 +31,13 @@ func ensurePod(ctx context.Context, s *Scenario, pod *corev1.Pod) {
_, err = kube.WaitUntilPodRunning(ctx, s.T, pod.Namespace, "", "metadata.name="+pod.Name)
require.NoErrorf(s.T, err, "failed to wait for pod %q to be in running state", pod.Name)
}

func truncatePodName(t *testing.T, pod *corev1.Pod) {
name := pod.Name
if len(pod.Name) < 63 {
return
}
pod.Name = pod.Name[:63]
pod.Name = strings.TrimRight(pod.Name, "-")
t.Logf("truncated pod name %q to %q", name, pod.Name)
}
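
An illustrative sketch of the helper's effect, assuming it is called from a test in the same package (which already imports corev1, strings, and testing); the long pod name below is made up:

func TestTruncatePodNameSketch(t *testing.T) {
	pod := &corev1.Pod{}
	pod.Name = "nodeconfig-gpu-driver-validation-" + strings.Repeat("x", 60)
	truncatePodName(t, pod)
	// The name now fits in 63 characters (the DNS-1123 label limit) and,
	// thanks to TrimRight, does not end with "-", so it stays a valid object name.
	t.Log(pod.Name)
}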
8 changes: 3 additions & 5 deletions e2e/scenario_helpers_test.go
@@ -39,12 +39,13 @@ func setupSignalHandler() context.Context {
go func() {
// block until signal is received
<-ch
fmt.Println(red("Received cancellation signal, gracefully shutting down the test suite. Cancel again to force exit."))
fmt.Println(red("Received cancellation signal, gracefully shutting down the test suite. Cancel again to force exit. (Created Azure resources will not be deleted in this case)"))
cancel()

// block until second signal is received
<-ch
fmt.Println(red("Received second cancellation signal, forcing exit."))
msg := fmt.Sprintf("Received second cancellation signal, forcing exit.\nPlease check https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/overview and delete any resources created by the test suite", config.Config.SubscriptionID, config.ResourceGroupName)
fmt.Println(red(msg))
os.Exit(1)
}()
return ctx
@@ -149,9 +150,6 @@ func prepareAKSNode(ctx context.Context, s *Scenario) {

s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s)
require.NoError(s.T, err, "failed to get VM private IP address")
hostPod, err := s.Runtime.Cluster.Kube.GetHostNetworkDebugPod(ctx, s.T)
require.NoError(s.T, err, "failed to get host network debug pod name")
s.Runtime.DebugHostPod = hostPod.Name
}

func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) {
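
The handler uses a common two-stage interrupt pattern: the first signal cancels the shared context so tests can shut down and clean up, and a second signal forces an immediate exit, which is why the updated messages warn that created Azure resources may be left behind. A generic sketch of the pattern using only the standard library (context, fmt, os, os/signal, syscall), not the suite's exact implementation:

func setupSignalHandlerSketch() context.Context {
	ctx, cancel := context.WithCancel(context.Background())
	ch := make(chan os.Signal, 2)
	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
	go func() {
		<-ch // first signal: begin graceful shutdown
		fmt.Println("received cancellation signal, shutting down gracefully; cancel again to force exit")
		cancel()
		<-ch // second signal: stop waiting and exit
		fmt.Println("received second cancellation signal, forcing exit")
		os.Exit(1)
	}()
	return ctx
}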
1 change: 0 additions & 1 deletion e2e/types.go
@@ -125,7 +125,6 @@ type ScenarioRuntime struct {
SSHKeyPublic []byte
SSHKeyPrivate []byte
VMPrivateIP string
DebugHostPod string
}

// Config represents the configuration of an AgentBaker E2E scenario.
2 changes: 1 addition & 1 deletion e2e/validators.go
@@ -174,7 +174,7 @@ func execScriptOnVMForScenario(ctx context.Context, s *Scenario, cmd string) *po
script.interpreter = Bash
}

result, err := execScriptOnVm(ctx, s, s.Runtime.VMPrivateIP, s.Runtime.DebugHostPod, string(s.Runtime.SSHKeyPrivate), script)
result, err := execScriptOnVm(ctx, s, s.Runtime.VMPrivateIP, s.Runtime.Cluster.DebugPod.Name, string(s.Runtime.SSHKeyPrivate), script)
require.NoError(s.T, err, "failed to execute command on VM")
return result
}
16 changes: 3 additions & 13 deletions e2e/vmss.go
@@ -74,24 +74,14 @@ func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScal

s.PrepareVMSSModel(ctx, s.T, &model)

operation, err := config.Azure.VMSS.BeginCreateOrUpdate(
ctx,
*cluster.Model.Properties.NodeResourceGroup,
s.Runtime.VMSSName,
model,
nil,
)
skipTestIfSKUNotAvailableErr(s.T, err)
require.NoError(s.T, err)
vmss, err := config.Azure.CreateVMSSWithRetry(ctx, s.T, *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, model)
s.T.Cleanup(func() {
cleanupVMSS(ctx, s)
})

vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)

skipTestIfSKUNotAvailableErr(s.T, err)
// fail test, but continue to extract debug information
require.NoError(s.T, err, "create vmss %q, check %s for vm logs", s.Runtime.VMSSName, testDir(s.T))
return &vmssResp.VirtualMachineScaleSet
return vmss
}

func skipTestIfSKUNotAvailableErr(t *testing.T, err error) {
Empty file removed vhdbuilder/release-notes/go.mod
