diff --git a/.dockerignore b/.dockerignore index d298dcaad3..ea89279094 100644 --- a/.dockerignore +++ b/.dockerignore @@ -28,6 +28,9 @@ bin /site/public /test +# Allow upgrade test directory +!/test/upgrade + # Created by .ignore support plugin (hsz.mobi) ### Go template # Binaries for programs and plugins diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 9f8003014b..29f08f5452 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -233,9 +233,18 @@ steps: # End to end tests # - # wait for us to be the oldest ongoing build before we run e2es - - name: gcr.io/cloud-builders/gcloud - id: e2e-wait-to-become-leader + # Build and Push upgrade test + - name: make-docker + id: push-upgrade-test + dir: test/upgrade + env: ['REGISTRY=${_REGISTRY}'] + args: [push] + waitFor: + - push-images + + # Wait for us to be the oldest ongoing build before we run upgrade and e2e tests + - name: gcr.io/google.com/cloudsdktool/cloud-sdk + id: wait-to-become-leader waitFor: [push-images] script: | #!/usr/bin/env bash @@ -258,10 +267,157 @@ steps: - BUILD_ID=$BUILD_ID - TRIGGER_NAME=$TRIGGER_NAME + # Run the upgrade tests parallel, fail this step if any of the tests fail + - name: gcr.io/google.com/cloudsdktool/cloud-sdk + id: submit-upgrade-test-cloud-build + dir: test/upgrade + entrypoint: bash + args: + - -c + - | + #!/usr/bin/env bash + set -e + set -o pipefail + export KUBECONFIG="/root/.kube/config" + mkdir -p /go/src/agones.dev/ /root/.kube/ + ln -s /workspace /go/src/agones.dev/agones + cd /go/src/agones.dev/agones/test/upgrade + + pids=() + typeset -A waitPids # Associative array for mapping `kubectl wait job` pid -> `kubectl wait job` output log name + tmpdir=$(mktemp -d) + trap 'rm -rf -- "$tmpdir"' EXIT SIGTERM + + # Update image tags to include the current build version. 
+ DevVersion="${_BASE_VERSION}-dev-$(git rev-parse --short=7 HEAD)" + export DevVersion + sed "s/\${DevVersion}/${DevVersion}/" upgradeTest.yaml > "${tmpdir}"/upgradeTest.yaml + sed "s/\${DevVersion}/${DevVersion}/" versionMap.yaml > "${tmpdir}"/versionMap.yaml + + # Kill all running child processes and remove the temp dir on exit or SIGTERM. A later trap replaces any earlier EXIT/SIGTERM trap, so the temp dir cleanup must live here too. + trap 'echo Cleaning up any remaining running pids: $(jobs -p) ; kill $(jobs -p) 2> /dev/null || : ; rm -rf -- "$tmpdir"' EXIT SIGTERM + + cloudProducts=("generic" "gke-autopilot") + declare -A versionsAndRegions=( [1.31]=us-east1 [1.30]=us-central1 [1.29]=us-west1 ) + + for cloudProduct in "${cloudProducts[@]}" + do + for version in "${!versionsAndRegions[@]}" + do + region=${versionsAndRegions[$version]} + if [ "$cloudProduct" = generic ] + then + testCluster="standard-upgrade-test-cluster-${version//./-}" + else + testCluster="gke-autopilot-upgrade-test-cluster-${version//./-}" + fi + testClusterLocation="${region}" + + gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID" + + if [ "$cloudProduct" = gke-autopilot ] ; then + # For autopilot clusters use evictable "balloon" pods to keep a buffer in node pool autoscaling. + kubectl apply -f evictablePods.yaml + fi + + # Clean up any existing job / namespace / apiservice from previous run + echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}". + if kubectl get jobs | grep upgrade-test-runner ; then + echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete job upgrade-test-runner + kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m + fi + + # Check if there are any dangling game servers. + if kubectl get gs | grep ".*"; then + # Remove any finalizers so that dangling game servers can be manually deleted. 
+ kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge + sleep 5 + echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete gs -l app=sdk-client-test + fi + + if kubectl get po -l app=sdk-client-test | grep ".*"; then + echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete po -l app=sdk-client-test + kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m + fi + + # The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating. + if kubectl get apiservice | grep v1.allocation.agones.dev ; then + echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete apiservice v1.allocation.agones.dev + fi + + if kubectl get namespace | grep agones-system ; then + echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete namespace agones-system + kubectl wait --for=delete ns agones-system --timeout=5m + fi + + if kubectl get crds | grep agones ; then + echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd + fi + + echo kubectl apply -f permissions.yaml on cluster "${testCluster}" + kubectl apply -f permissions.yaml + echo kubectl apply -f versionMap.yaml on cluster "${testCluster}" + kubectl apply -f "${tmpdir}"/versionMap.yaml + echo kubectl apply -f gameserverTemplate.yaml on cluster "${testCluster}" + kubectl apply -f gameserverTemplate.yaml + + echo kubectl apply -f upgradeTest.yaml on cluster "${testCluster}" + kubectl apply -f "${tmpdir}"/upgradeTest.yaml + + # We need to wait for job pod to be created and ready before we can wait on the job itself. 
+ # TODO: Once all test clusters are at Kubernetes Version >= 1.31 use `kubectl wait --for=create` instead of sleep. + # kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m + sleep 10s + kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m + + echo Wait for job upgrade-test-runner to complete or fail on cluster "${testCluster}" + kubectl wait job/upgrade-test-runner --timeout=20m --for jsonpath='{.status.conditions[*].status}'=True -o jsonpath='{.status.conditions[*].type}' | tee "${tmpdir}"/"${testCluster}".log & + waitPid=$! + pids+=( "$waitPid" ) + waitPids[$waitPid]="${tmpdir}"/"${testCluster}".log + done + done + + for pid in "${pids[@]}"; do + # This block executes when the process exits and pid status==0 + if wait $pid; then + outputLog="${waitPids[$pid]}" + # wait for output to finish writing to file + until [ -s "$outputLog" ]; do sleep 1; done + output=$(<"${outputLog}") + echo "${outputLog}": "${output}" + + # "Complete" is successful job run. + # Version 1.31 has "SuccessCriteriaMet" as the first completion status returned, or "FailureTarget" in case of failure. + if [ "$output" == "Complete" ] || [ "$output" == "SuccessCriteriaMet" ] ; then + continue + else + exit 1 + fi + # This block executes when the process exits and pid status!=0 + else + status=$? + outputLog="${waitPids[$pid]}" + echo "One of the upgrade tests pid $pid from cluster log $outputLog exited with a non-zero status ${status}." 
+ exit $status + fi + done + echo "End of Upgrade Tests" + + waitFor: + - wait-to-become-leader + - push-upgrade-test + # cancel all the orphan e2e test cloud builds, fail to cancel any of the build will fail this whole build - name: gcr.io/cloud-builders/gcloud id: cancel-orphan-e2e-tests - waitFor: [e2e-wait-to-become-leader] + waitFor: [wait-to-become-leader] script: | #!/usr/bin/env bash until gcloud builds list --ongoing --filter "tags:'e2e-test'" --format="value(id)" | xargs --no-run-if-empty gcloud builds cancel @@ -386,7 +542,7 @@ steps: # - name: gcr.io/cloud-builders/gcloud id: cleanup-services - waitFor: [e2e-wait-to-become-leader] + waitFor: [wait-to-become-leader] allowFailure: true entrypoint: bash args: @@ -400,6 +556,7 @@ steps: done substitutions: + _BASE_VERSION: 1.46.0 _CACHE_BUCKET: agones-build-cache _HTMLTEST_CACHE_KEY: htmltest-0.10.1 _CPP_SDK_BUILD_CACHE_KEY: cpp-sdk-build @@ -407,7 +564,7 @@ substitutions: _RUST_SDK_BUILD_CACHE_KEY: rust-sdk-build _REGISTRY: us-docker.pkg.dev/${PROJECT_ID}/ci tags: [ci, 'commit-${COMMIT_SHA}'] -timeout: 18000s # 5h: 3h (e2e-wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else) +timeout: 18000s # 5h: 3h (wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else) queueTtl: 259200s # 72h images: - ${_REGISTRY}/agones-controller diff --git a/test/sdk/go/Makefile b/test/sdk/go/Makefile index 45b8d7726a..86ba5dba7b 100644 --- a/test/sdk/go/Makefile +++ b/test/sdk/go/Makefile @@ -29,7 +29,7 @@ project_path := $(dir $(mkfile_path)) root_path = $(realpath $(project_path)/) # Because go mod init in the Dockerfile installs the most recently released version of Agones, this # will need to be built and pushed post-release. During DEV it will be built at DEV - 1. 
-release_version = 1.44.0 +release_version = 1.45.0 server_tag := $(REGISTRY)/sdk-client-test:$(release_version) # _____ _ diff --git a/test/upgrade/Dockerfile b/test/upgrade/Dockerfile index 52aa6a5e3f..68583bfae4 100644 --- a/test/upgrade/Dockerfile +++ b/test/upgrade/Dockerfile @@ -12,22 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM gcr.io/cloud-builders/gcloud AS builder +FROM golang:1.22.9-alpine AS builder -RUN apt-get update && \ - apt-get install -y curl && \ - apt-get clean +# install curl +RUN apk update && \ + apk upgrade && \ + apk --no-cache add curl WORKDIR /usr/local # install kubectl -ENV KUBECTL_VER=1.29.7 +ENV KUBECTL_VER=1.30.4 RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VER}/bin/linux/amd64/kubectl && \ chmod go+rx ./kubectl && \ mv ./kubectl /usr/local/bin/kubectl # install Helm package manager -ENV HELM_VER=3.14.3 +ENV HELM_VER=3.16.3 ENV HELM_URL=https://get.helm.sh/helm-v${HELM_VER}-linux-amd64.tar.gz RUN curl -L ${HELM_URL} > /tmp/helm.tar.gz \ && tar -zxvf /tmp/helm.tar.gz -C /tmp \ @@ -35,27 +36,20 @@ RUN curl -L ${HELM_URL} > /tmp/helm.tar.gz \ && chmod go+rx /usr/local/bin/helm \ && rm /tmp/helm.tar.gz && rm -rf /tmp/linux-amd64 -# Build the Go image from source -FROM golang:1.22.6 AS build-stage - +# Copy and build the Go application WORKDIR /agones.dev - -COPY *.go ./ - +COPY test/upgrade/main.go ./ RUN go mod init agones.dev/agones/test/upgrade/testContainer RUN go mod tidy RUN go mod download - RUN CGO_ENABLED=0 GOOS=linux go build -o /upgrade-test -# Copy the above binary into a lean image -FROM gcr.io/distroless/static-debian12:nonroot AS build-release-stage - +# Copy the dev build Agones Helm chart WORKDIR / -COPY --from=build-stage /upgrade-test /upgrade-test -COPY --from=builder /usr/local /usr/local - -USER nonroot:nonroot +# Use a non-root user for security best practices +RUN adduser -D -g '' adduser +USER adduser 
+COPY --chown=adduser install/helm/agones /install/helm ENTRYPOINT ["/upgrade-test"] diff --git a/test/upgrade/Makefile b/test/upgrade/Makefile index 4b6bfe5a0a..e7c014412f 100644 --- a/test/upgrade/Makefile +++ b/test/upgrade/Makefile @@ -24,12 +24,11 @@ # REGISTRY ?= -mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) -project_path := $(dir $(mkfile_path)) -root_path = $(realpath $(project_path)/) -dev_version = 1.44.0-dev -server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version) - +base_version = 1.46.0 +# Version defaults to the short hash of the latest commit +VERSION ?= $(base_version)-dev-$(shell git rev-parse --short=7 HEAD) +server_tag := $(REGISTRY)/upgrade-test-controller:$(VERSION) +cwd:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) # _____ _ # |_ _|_ _ _ __ __ _ ___| |_ ___ # | |/ _` | '__/ _` |/ _ \ __/ __| @@ -37,9 +36,12 @@ server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version) # |_|\__,_|_| \__, |\___|\__|___/ # |___/ +# Using .ONESHELL allows us to `cd` to the parent directory agones. This gives the Dockerfile the +# context of the agones directory, which allows it to COPY files from any child directory. +.ONESHELL: # Build a docker image for the server, and tag it build: - cd $(root_path) && docker build -f $(project_path)Dockerfile --tag=$(server_tag) . + cd "$(cwd)/../.." && DOCKER_BUILDKIT=1 docker build -f $(cwd)/Dockerfile --tag=$(server_tag) . push: build docker push $(server_tag) diff --git a/test/upgrade/evictablePods.yaml b/test/upgrade/evictablePods.yaml new file mode 100644 index 0000000000..59a6765f4b --- /dev/null +++ b/test/upgrade/evictablePods.yaml @@ -0,0 +1,67 @@ +# Copyright 2024 Google LLC All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Create evictable pods to prevent Autopilot clusters from completely scaling down. +# https://cloud.google.com/kubernetes-engine/docs/how-to/capacity-provisioning +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low-priority +value: -10 +preemptionPolicy: Never +globalDefault: false +description: "Low priority workloads" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: evictable-pods-deployment +spec: + replicas: 200 + selector: + matchLabels: + app: evictable-pods + template: + metadata: + labels: + app: evictable-pods + # Label for use with packed game server pod affinity rules + agones.dev/role: gameserver + spec: + priorityClassName: low-priority + terminationGracePeriodSeconds: 0 + containers: + - name: ubuntu + image: ubuntu + imagePullPolicy: IfNotPresent + command: ["sleep"] + args: ["infinity"] + resources: + requests: + memory: 52Mi + cpu: 30m + limits: + memory: 52Mi + cpu: 30m + # Use same affinity as packed game server pods + affinity: + podAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + agones.dev/role: gameserver + topologyKey: kubernetes.io/hostname + weight: 100 diff --git a/test/upgrade/gameserverTemplate.yaml b/test/upgrade/gameserverTemplate.yaml index 407ed218aa..f93c6088bc 100644 --- a/test/upgrade/gameserverTemplate.yaml +++ b/test/upgrade/gameserverTemplate.yaml @@ -51,16 +51,23 @@ data: metadata: labels: agonesVersion: {{ .AgonesVersion }} + app: sdk-client-test spec: containers: - name: sdk-client-test image: "{{ 
.Registry }}:{{ .AgonesVersion }}" imagePullPolicy: Always + env: + - name: SHUTDOWN_DELAY_SECONDS + value: "10" + - name: GRACEFUL_TERMINATION_DELAY_SECONDS + value: "10" resources: requests: - memory: 64Mi + memory: 52Mi cpu: 20m limits: - memory: 64Mi + memory: 52Mi cpu: 20m serviceAccountName: agones-sa + restartPolicy: Never diff --git a/test/upgrade/go.mod b/test/upgrade/go.mod deleted file mode 100644 index f7615ef1da..0000000000 --- a/test/upgrade/go.mod +++ /dev/null @@ -1,51 +0,0 @@ -module agones.dev/agones/test/upgrade/testContainer - -go 1.22 - -toolchain go1.22.6 - -require ( - k8s.io/apimachinery v0.31.0 - k8s.io/client-go v0.31.0 -) - -require ( - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.11.0 // indirect - github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect - github.com/go-openapi/jsonpointer v0.19.6 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect - github.com/go-openapi/swag v0.22.4 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect - github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/go-cmp v0.6.0 // indirect - github.com/google/gofuzz v1.2.0 // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/josharian/intern v1.0.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect - github.com/mailru/easyjson v0.7.7 // indirect - github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/onsi/gomega v1.33.1 // indirect - github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.26.0 // indirect - golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.21.0 // indirect - golang.org/x/term v0.21.0 // indirect - golang.org/x/text v0.16.0 // indirect - 
golang.org/x/time v0.3.0 // indirect - google.golang.org/protobuf v1.34.2 // indirect - gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/api v0.31.0 // indirect - k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect - k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect - sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect -) diff --git a/test/upgrade/main.go b/test/upgrade/main.go index 6ce62924ae..17104a1eff 100644 --- a/test/upgrade/main.go +++ b/test/upgrade/main.go @@ -28,8 +28,12 @@ import ( "strings" "time" + agonesv1 "agones.dev/agones/pkg/apis/agones/v1" + "agones.dev/agones/pkg/client/clientset/versioned" + "agones.dev/agones/pkg/client/informers/externalversions" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" @@ -48,17 +52,24 @@ const ( SidecarPullPolicy = "true" // LogLevel sets the Agones Helm configuration log level LogLevel = "debug" + // Timeout sets the amount of time to wait for resources to become ready. Should be more than the + // time for an Autopilot cluster to scale up. 
+ Timeout = 10 * time.Minute // HelmChart is the helm chart for the public Agones releases HelmChart = "agones/agones" + // TestChart is the registry for Agones Helm chart development builds + TestChart = "./install/helm" // AgonesRegistry is the public registry for Agones releases AgonesRegistry = "us-docker.pkg.dev/agones-images/release" - // TestRegistry is the public registry for upgrade test container files - TestRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test" + // TestRegistry is the registry for Agones development builds + TestRegistry = "us-docker.pkg.dev/agones-images/ci" + // ContainerRegistry is the registry for upgrade test container files + ContainerRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test" ) var ( - // Dev is the current development version of Agones - Dev = os.Getenv("Dev") + // DevVersion is the current development version of Agones + DevVersion = os.Getenv("DevVersion") // ReleaseVersion is the latest released version of Agones (DEV - 1). 
ReleaseVersion = os.Getenv("ReleaseVersion") // PodName the name of the pod this container is running in @@ -81,8 +92,14 @@ func main() { log.Fatal("Could not create the kubernetes api clientset", err) } + agonesClient, err := versioned.NewForConfig(cfg) + if err != nil { + log.Fatal("Could not create the agones api clientset: ", err) + } + validConfigs := configTestSetup(ctx, kubeClient) - go watchGameServerPods(kubeClient, make(chan struct{}), make(map[string]podLog), len(validConfigs)*2) + go watchGameServers(agonesClient, len(validConfigs)*2) + go watchGameServerEvents(kubeClient) addAgonesRepo() runConfigWalker(ctx, validConfigs) cleanUpResources() @@ -111,9 +128,10 @@ type gameServerTemplate struct { CountsAndLists bool } -type podLog struct { +type gsLog struct { SdkVersion string GameServerVersion string + GameServerState string } type helmStatuses []struct { @@ -136,7 +154,7 @@ func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*c // Get the mappings of valid Kubernetes, Agones, and Feature Gate versions from the configmap. err := json.Unmarshal([]byte(VersionMappings), &versionMap) if err != nil { - log.Fatal("Could not Unmarshal", err) + log.Fatal("Could not Unmarshal ", err) } // Find valid Agones versions and feature gates for the current version of Kubernetes. @@ -148,7 +166,7 @@ func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*c countsAndLists := containsCountsAndLists(agonesVersion) ct.agonesVersion = agonesVersion if agonesVersion == "Dev" { - ct.agonesVersion = Dev + ct.agonesVersion = DevVersion // Game server container cannot be created at DEV version due to go.mod only able to access // published Agones versions. Use N-1 for DEV. 
ct.gameServerPath = createGameServerFile(ReleaseVersion, countsAndLists) @@ -285,19 +303,19 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) { for _, config := range validConfigs { registry := AgonesRegistry chart := HelmChart - if config.agonesVersion == Dev { - // TODO: Update to templated value for registry and chart for Dev build - continue + if config.agonesVersion == DevVersion { + registry = TestRegistry + chart = TestChart } err := installAgonesRelease(config.agonesVersion, registry, config.featureGates, ImagePullPolicy, SidecarPullPolicy, LogLevel, chart) if err != nil { - log.Printf("installAgonesRelease err: %s", err) + log.Fatalf("installAgonesRelease err: %s", err) } // Wait for the helm release to install. Waits the same amount of time as the Helm timeout. var helmStatus string - err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (done bool, err error) { + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, Timeout, true, func(_ context.Context) (done bool, err error) { helmStatus = checkHelmStatus(config.agonesVersion) if helmStatus == "deployed" { return true, nil @@ -309,7 +327,11 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) { config.agonesVersion, helmStatus) } - go createGameServers(cancelCtx, config.gameServerPath) + gsReady := make(chan bool) + go createGameServers(cancelCtx, config.gameServerPath, gsReady) + // Wait for the first game server pod created to become ready + <-gsReady + close(gsReady) // Allow some soak time at the Agones version before next upgrade time.Sleep(1 * time.Minute) } @@ -332,6 +354,12 @@ func checkHelmStatus(agonesVersion string) string { log.Fatal("Could not Unmarshal", err) } + // Remove the commit sha from the DevVersion i.e. 
from 1.46.0-dev-7168dd3 to 1.46.0-dev + if agonesVersion == DevVersion { + r := regexp.MustCompile(`1\.\d+\.\d+-dev`) + agonesVersion = r.FindString(DevVersion) + } + for _, status := range helmStatus { if status.AppVersion == agonesVersion { return status.Status @@ -342,8 +370,9 @@ func checkHelmStatus(agonesVersion string) string { // Creates a gameserver yaml file from the mounted gameserver.yaml template. The name of the new // gameserver yaml is based on the Agones version, i.e. gs1440.yaml for Agones version 1.44.0 +// Note: This does not validate the created file. func createGameServerFile(agonesVersion string, countsAndLists bool) string { - gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists} + gsTmpl := gameServerTemplate{Registry: ContainerRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists} gsTemplate, err := template.ParseFiles("gameserver.yaml") if err != nil { @@ -377,12 +406,16 @@ func createGameServerFile(agonesVersion string, countsAndLists bool) string { } // Create a game server every five seconds until the context is cancelled. The game server container -// be the same binary version as the game server file. The SDK version is always the same as the +// is the same binary version as the game server file. The SDK version is always the same as the // version of the Agones controller that created it. The Game Server shuts itself down after the // tests have run as part of the `sdk-client-test` logic. -func createGameServers(ctx context.Context, gsPath string) { +func createGameServers(ctx context.Context, gsPath string, gsReady chan bool) { args := []string{"create", "-f", gsPath} + checkFirstGameServerReady(ctx, gsReady, args...) + ticker := time.NewTicker(5 * time.Second) + retries := 8 + retry := 0 for { select { @@ -391,39 +424,82 @@ func createGameServers(ctx context.Context, gsPath string) { return case <-ticker.C: _, err := runExecCommand(KubectlCmd, args...) 
- // TODO: Do not ignore error if unable to create due to something other than cluster scale up + // Ignore failures for ~45s at a time to account for the brief period (~30s) during which the + // controller service is unavailable during upgrade. if err != nil { - log.Printf("Could not create Gameserver %s: %s", gsPath, err) + if retry > retries { + log.Fatalf("Could not create Gameserver %s: %s. Too many successive errors.", gsPath, err) + } + log.Printf("Could not create Gameserver %s: %s. Retries left: %d.", gsPath, err, retries-retry) + retry++ + } else { + retry = 0 } } } } -// watchGameServerPods watches all game server pods for CrashLoopBackOff. Errors if the number of -// CrashLoopBackOff backoff pods exceeds the number of acceptedFailures. -func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, failedPods map[string]podLog, acceptedFailures int) { - // Filter by label agones.dev/role=gameserver to only game server pods - labelOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) { - opts.LabelSelector = "agones.dev/role=gameserver" +// checkFirstGameServerReady waits for the Game Server Pod to be running. This may take several +// minutes in Autopilot. +func checkFirstGameServerReady(ctx context.Context, gsReady chan bool, args ...string) { + // Sample output: gameserver.agones.dev/sdk-client-test-5zjdn created + output, err := runExecCommand(KubectlCmd, args...) + if err != nil { + log.Fatalf("Could not create Gameserver: %s", err) + } + r := regexp.MustCompile(`sdk-client-test-\S+`) + gsName := r.FindString(string(output)) + // Game Server has too many states, so using the pod instead as there are only two healthy states. + // Includes the gs name to make output logs easier to read. + getPodStatus := []string{"get", "pod", gsName, "-o=custom-columns=:.status.phase,:.metadata.name", "--no-headers"} + + // Pod is created after Game Server, wait briefly before erroring out on unable to get pod. 
+ retries := 0 + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, Timeout, true, func(_ context.Context) (done bool, err error) { + out, err := runExecCommand(KubectlCmd, getPodStatus...) + if err != nil && retries > 2 { + log.Fatalf("Could not get Gameserver %s state: %s", gsName, err) + } + if err != nil { + retries++ + return false, nil + } + // Sample output: Running sdk-client-test-bbvx9 + podStatus := strings.Split(string(out), " ") + if podStatus[0] == "Running" || podStatus[0] == "Succeeded" { + gsReady <- true + return true, nil + } + return false, nil }) - kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second, - informers.WithNamespace("default"), labelOptions) - podInformer := kubeInformerFactory.Core().V1().Pods().Informer() + if err != nil { + log.Fatalf("PollUntilContextTimeout timed out while wait for first gameserver %s to be Ready", gsName) + } +} - _, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ +// watchGameServers watches all game servers for errors. Errors if the number of failed game servers +// exceeds the number of acceptedFailures. 
+func watchGameServers(agonesClient *versioned.Clientset, acceptedFailures int) { + stopCh := make(chan struct{}) + failedGs := make(map[string]gsLog) + + agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, 5*time.Second) + gsInformer := agonesInformerFactory.Agones().V1().GameServers().Informer() + + _, err := gsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ UpdateFunc: func(_, newObj interface{}) { - newPod := newObj.(*v1.Pod) - for _, cs := range newPod.Status.ContainerStatuses { - if cs.Name != "sdk-client-test" || cs.State.Waiting == nil || cs.State.Waiting.Reason != "CrashLoopBackOff" { - continue - } - gsVersion := newPod.Labels["agonesVersion"] - sdkVersion := newPod.Annotations["agones.dev/sdk-version"] - log.Printf("%s for pod: %s with game server binary version %s, and SDK version %s", cs.State.Waiting.Reason, newPod.Name, gsVersion, sdkVersion) - // Put failed pods into the map until it reaches capacity. - failedPods[newPod.Name] = podLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion} - if len(failedPods) > acceptedFailures { - log.Fatalf("Too many Game Server pods in CrashLoopBackOff: %v", failedPods) + newGs := newObj.(*agonesv1.GameServer) + if newGs.Status.State == "Error" || newGs.Status.State == "Unhealthy" { + gsVersion := newGs.Labels["agonesVersion"] + sdkVersion := newGs.Annotations["agones.dev/sdk-version"] + log.Printf("Game server %s with binary version %s, and SDK version %s in %s state\n", + newGs.Name, gsVersion, sdkVersion, newGs.Status.State) + + // Put failed game servers into the map until it reaches capacity. 
+				failedGs[newGs.Name] = gsLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion,
+					GameServerState: string(newGs.Status.State)}
+				if len(failedGs) > acceptedFailures {
+					log.Fatalf("Too many Game Servers in Error or Unhealthy states: %v", failedGs)
 				}
 			}
 		},
@@ -432,9 +508,57 @@ func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{},
 		log.Fatal("Not able to create AddEventHandler", err)
 	}
 
-	go podInformer.Run(stopCh)
-	if !cache.WaitForCacheSync(stopCh, podInformer.HasSynced) {
-		log.Fatal("Timed out waiting for caches to sync")
+	go gsInformer.Run(stopCh)
+	if !cache.WaitForCacheSync(stopCh, gsInformer.HasSynced) {
+		log.Fatal("Timed out waiting for game server informer cache to sync")
+	}
+}
+
+// watchGameServerEvents watches Events in the "default" namespace whose involvedObject.fieldPath is
+// `spec.containers{sdk-client-test}` and exits via log.Fatalf as soon as one with Reason "Failed" is
+// added. The purpose is to catch ImagePullBackOff errors.
+//
+// Events matching the selector are deleted before the watch starts. The informer's stop channel is
+// never closed, so the watch runs until the process exits.
+func watchGameServerEvents(kubeClient *kubernetes.Clientset) {
+	stopCh := make(chan struct{})
+
+	// Filter by Game Server `sdk-client-test` containers
+	containerName := "sdk-client-test"
+	containerPath := "spec.containers{sdk-client-test}"
+	fieldSelector := fields.OneTermEqualSelector("involvedObject.fieldPath", containerPath).String()
+	// First delete previous `sdk-client-test` events, otherwise there will be events from previous runs.
+	_, err := runExecCommand(KubectlCmd, []string{"delete", "events", "--field-selector", fieldSelector}...)
+	if err != nil {
+		log.Fatal("Could not delete `sdk-client-test` events: ", err)
+	}
+
+	eventOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
+		opts.FieldSelector = fieldSelector
+	})
+	kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second,
+		informers.WithNamespace("default"), eventOptions)
+	eventInformer := kubeInformerFactory.Core().V1().Events().Informer()
+
+	_, err = eventInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(obj interface{}) {
+			newEvent := obj.(*v1.Event)
+			gsPodName := newEvent.InvolvedObject.Name
+			if newEvent.Reason == "Failed" {
+				// Report the involved object's Kind: the Event's own Kind (TypeMeta) is typically left
+				// empty on objects delivered by a typed informer.
+				log.Fatalf("%s on %s %s has failed. Latest event: message %s", containerName,
+					newEvent.InvolvedObject.Kind, gsPodName, newEvent.Message)
+			}
+		},
+	})
+	if err != nil {
+		log.Fatal("Not able to create AddEventHandler: ", err)
+	}
+
+	go eventInformer.Run(stopCh)
+	if !cache.WaitForCacheSync(stopCh, eventInformer.HasSynced) {
+		log.Fatal("Timed out waiting for eventInformer cache to sync")
 	}
 }
 
@@ -455,7 +579,7 @@ func cleanUpResources() {
 	// Apiservice v1.allocation.agones.dev, which is part of Service agones-system/agones-controller-service,
 	// does not always get cleaned up on Helm uninstall, and needs to be deleted (if it exists) before
 	// the agones-system namespace can be removed.
-	// Ignore the error, because an "error" means Helm already uninstall the apiservice.
+	// Ignore the error, because an "error" means Helm already uninstalled the apiservice.
 	args = []string{"delete", "apiservice", "v1.allocation.agones.dev"}
 	out, err := runExecCommand(KubectlCmd, args...)
 	if err == nil {
diff --git a/test/upgrade/permissions.yaml b/test/upgrade/permissions.yaml
index 1f4a96005b..54e0d57215 100644
--- a/test/upgrade/permissions.yaml
+++ b/test/upgrade/permissions.yaml
@@ -24,18 +24,18 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
   namespace: default
-  name: pod-reader
+  name: pod-manager
 rules:
 - apiGroups: [""] # "" indicates the core API group
-  resources: ["pods"]
-  verbs: ["get", "watch", "list"]
+  resources: ["pods", "events"]
+  verbs: ["get", "delete", "list", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-# This role binding allows default service account to read all pods in the "default" namespace.
-# You need to already have a Role named "pod-reader" in that namespace.
+# This role binding allows default service account to get, list, watch and delete pods and events in the
+# "default" namespace. You need to already have a Role named "pod-manager" in that namespace.
 kind: RoleBinding
 metadata:
-  name: read-pods
+  name: manage-pods
   namespace: default
 subjects:
 - kind: ServiceAccount
@@ -44,7 +44,7 @@ subjects:
 roleRef:
   # "roleRef" specifies the binding to a Role / ClusterRole
   kind: Role # this must be Role or ClusterRole
-  name: pod-reader # this must match the name of the Role or ClusterRole you wish to bind to
+  name: pod-manager # this must match the name of the Role or ClusterRole you wish to bind to
   apiGroup: rbac.authorization.k8s.io
 ---
 kind: ClusterRole
@@ -202,23 +202,23 @@ roleRef:
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  name: apiservices-creator
+  name: apiservices-manager
 rules:
 - apiGroups: ["apiregistration.k8s.io"]
   resources: ["apiservices"]
-  verbs: ["get", "watch", "list", "create", "patch"]
+  verbs: ["create", "delete", "get", "list", "patch", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
-  name: create-apiservices
+  name: manage-apiservices
 subjects:
 - kind: ServiceAccount
   name: agones-sa
   namespace: default
 roleRef:
   kind: ClusterRole
-  name: apiservices-creator
+  name: apiservices-manager
   apiGroup: rbac.authorization.k8s.io
 ---
 # Agones needs to be able to create Agones CustomResourceDefinitions
@@ -249,23 +249,23 @@ roleRef:
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  name: clusterrole-creator
+  name: clusterrole-manager
 rules:
 - apiGroups: ["rbac.authorization.k8s.io"]
   resources: ["clusterroles", "clusterrolebindings", "rolebindings"]
-  verbs: ["get", "watch", "list", "create", "patch"]
+  verbs: ["create", "delete", "get", "list", "patch", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
-  name: create-clusterroles
+  name: manage-clusterroles
 subjects:
 - kind: ServiceAccount
   name: agones-sa
   namespace: default
 roleRef:
   kind: ClusterRole
-  name: clusterrole-creator
+  name: clusterrole-manager
   apiGroup: rbac.authorization.k8s.io
 ---
 # Agones needs to be able to create deployments
@@ -498,3 +498,41 @@ roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: sdk
+---
+# Source: agones/templates/hooks/sa.yaml
+# Permissions to grant to helm on helm uninstall
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  namespace: agones-system
+  name: helm-cleanup
+  labels:
+    app: agones
+rules:
+  - apiGroups: ["agones.dev", "multicluster.agones.dev", "autoscaling.agones.dev"]
+    resources: ["fleets", "fleetautoscalers", "gameservers", "gameserversets", "gameserverallocationpolicies"]
+    verbs: ["delete", "get", "list"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list"]
+  - apiGroups: [""]
+    resources: ["configmaps"]
+    verbs: ["create", "delete", "get", "list"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["create", "delete", "get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: helm-cleanup-access
+  labels:
+    app: agones
+subjects:
+  - kind: ServiceAccount
+    name: agones-sa
+    namespace: default
+roleRef:
+  kind: ClusterRole
+  name: helm-cleanup
+  apiGroup: rbac.authorization.k8s.io
diff --git a/test/upgrade/upgradeTest.yaml b/test/upgrade/upgradeTest.yaml
index ebb301953c..4b549500aa 100644
---
a/test/upgrade/upgradeTest.yaml
+++ b/test/upgrade/upgradeTest.yaml
@@ -26,8 +26,7 @@ spec:
     spec:
       containers:
         - name: upgrade-test-controller
-          # TODO: Update image name to use a templated value for current Dev version
-          image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:1.44.0-dev
+          image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:${DevVersion} # placeholder tag; cloudbuild.yaml replaces it via sed with the current dev version
           imagePullPolicy: Always
           env:
             - name: PodName
diff --git a/test/upgrade/versionMap.yaml b/test/upgrade/versionMap.yaml
index b0a7499de0..7c8a1724cd 100644
--- a/test/upgrade/versionMap.yaml
+++ b/test/upgrade/versionMap.yaml
@@ -18,82 +18,33 @@ kind: ConfigMap
 metadata:
   name: version-map
 data:
-  Dev: "1.44.0-dev"
-  ReleaseVersion: "1.43.0"
+  DevVersion: ${DevVersion} # placeholder; cloudbuild.yaml replaces it via sed with the current dev version
+  ReleaseVersion: "1.45.0"
   version-mappings.json: |
     {
       "k8sToAgonesVersions": {
-        "1.25": [
-          "1.34.0",
-          "1.35.0"
-        ],
-        "1.26": [
-          "1.34.0",
-          "1.35.0",
-          "1.36.0",
-          "1.37.0",
-          "1.38.0",
-          "1.39.0"
-        ],
-        "1.27": [
-          "1.34.0",
-          "1.35.0",
-          "1.36.0",
-          "1.37.0",
-          "1.38.0",
-          "1.39.0",
-          "1.40.0",
-          "1.41.0",
-          "1.42.0"
-        ],
-        "1.28": [
-          "1.36.0",
-          "1.37.0",
-          "1.38.0",
-          "1.39.0",
-          "1.40.0",
-          "1.41.0",
-          "1.42.0",
-          "1.43.0",
-          "Dev"
-        ],
         "1.29": [
           "1.40.0",
           "1.41.0",
           "1.42.0",
           "1.43.0",
+          "1.44.0",
+          "1.45.0",
           "Dev"
         ],
         "1.30": [
           "1.43.0",
+          "1.44.0",
+          "1.45.0",
+          "Dev"
+        ],
+        "1.31": [
+          "1.44.0",
+          "1.45.0",
           "Dev"
         ]
       },
       "agonesVersionFeatureGates": {
-        "1.34.0": {
-          "alphaGates": ["PlayerAllocationFilter", "PlayerTracking"],
-          "betaGates": []
-        },
-        "1.35.0": {
-          "alphaGates": ["PlayerAllocationFilter", "PlayerTracking"],
-          "betaGates": []
-        },
-        "1.36.0": {
-          "alphaGates": ["PlayerAllocationFilter", "PlayerTracking"],
-          "betaGates": []
-        },
-        "1.37.0": {
-          "alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
-          "betaGates": []
-        },
-        "1.38.0": {
-          "alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
-          "betaGates": []
-        },
-        "1.39.0": {
-          "alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
-          "betaGates": []
-        },
         "1.40.0": {
           "alphaGates": ["CountsAndLists", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"],
           "betaGates": ["DisableResyncOnSDKServer"]
@@ -110,9 +61,17 @@ data:
           "alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix"],
           "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"]
         },
+        "1.44.0": {
+          "alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
+          "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"]
+        },
+        "1.45.0": {
+          "alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
+          "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"]
+        },
         "Dev": {
-          "alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
-          "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"]
+          "alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"],
+          "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"]
         }
       }
     }