From 80dd2eaf04308f6b9ffee49925d4e4378390de13 Mon Sep 17 00:00:00 2001 From: Zach Loafman Date: Thu, 30 May 2024 16:04:04 -0700 Subject: [PATCH] agones-{extensions,allocator}: Pause after cancelling context (#3843) * Revert "agones-{extensions,allocator}: Be more defensive about draining" This reverts commit 68b04ee9dc59ca2d2f21118e8cadd754f9d2fc37. * agones-{extensions,allocator}: Pause after cancelling context After #3839 went in, we noticed the flakes in TestAllocatorAfterDeleteReplica disappear, but TestGameServerCreationRightAfterDeletingOneExtensionsPod remained. I looked more closely at this and I developed a theory: My guess is that `kube-apiserver` connections to webhooks are actually "sticky" using http(s) keepalives. If the TCP connection never closes, we'd see the behavior we do, which is the `EOF` from kube-apiserver going to write on a dead socket. I looked at how to close sockets on the server side to "finish" the drain, but realized that we actually do, by cancelling the context. The thing is that we cancel the context and immediately exit, but it leaves no time for anything to shut down. I tested this and it seems to drive away the flake, without adding in the extra delay. Reverts #3839 --- cmd/allocator/main.go | 3 ++- cmd/extensions/main.go | 3 ++- install/helm/agones/templates/extensions-deployment.yaml | 4 ++-- install/helm/agones/templates/service/allocation.yaml | 4 ++-- install/yaml/install.yaml | 8 ++++---- test/e2e/allocator/pod_termination_test.go | 2 +- test/e2e/extensions/high_availability_test.go | 2 +- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cmd/allocator/main.go b/cmd/allocator/main.go index 3c22d9b813..9e31cbda83 100644 --- a/cmd/allocator/main.go +++ b/cmd/allocator/main.go @@ -247,7 +247,8 @@ func main() { podReady = false time.Sleep(conf.ReadinessShutdownDuration) cancelCtx() - logger.Infof("Readiness shutdown duration has passed, exiting pod") + logger.Infof("Readiness shutdown duration has passed, context cancelled") + time.Sleep(1 * time.Second) // allow a brief time for cleanup, but force exit if main doesn't os.Exit(0) }) diff --git a/cmd/extensions/main.go b/cmd/extensions/main.go index f09e83aaea..31987b1037 100644 --- a/cmd/extensions/main.go +++ b/cmd/extensions/main.go @@ -190,7 +190,8 @@ func main() { podReady = false time.Sleep(ctlConf.ReadinessShutdownDuration) cancelCtx() - logger.Infof("Readiness shutdown duration has passed, exiting pod") + logger.Infof("Readiness shutdown duration has passed, context cancelled") + time.Sleep(1 * time.Second) // allow a brief time for cleanup, but force exit if main doesn't os.Exit(0) }) diff --git a/install/helm/agones/templates/extensions-deployment.yaml b/install/helm/agones/templates/extensions-deployment.yaml index 8e8bee40f3..9e7dc35d08 100644 --- a/install/helm/agones/templates/extensions-deployment.yaml +++ b/install/helm/agones/templates/extensions-deployment.yaml @@ -92,7 +92,7 @@ spec: priorityClassName: {{ .Values.agones.priorityClassName }} {{- end }} serviceAccountName: {{ .Values.agones.serviceaccount.controller.name }} - terminationGracePeriodSeconds: {{ mul .Values.agones.extensions.readiness.periodSeconds .Values.agones.extensions.readiness.failureThreshold 5 }} + terminationGracePeriodSeconds: {{ mul .Values.agones.extensions.readiness.periodSeconds .Values.agones.extensions.readiness.failureThreshold 3 }} containers: - name: agones-extensions image: "{{ .Values.agones.image.registry }}/{{ .Values.agones.image.extensions.name}}:{{ default .Values.agones.image.tag .Values.agones.image.extensions.tag }}" @@ -137,7 +137,7 @@ spec: - name: CONTAINER_NAME value: "agones-extensions" - name: READINESS_SHUTDOWN_DURATION - value: {{ mul .Values.agones.extensions.readiness.periodSeconds .Values.agones.extensions.readiness.failureThreshold 4 }}s + value: {{ mul .Values.agones.extensions.readiness.periodSeconds .Values.agones.extensions.readiness.failureThreshold 2 }}s ports: - name: webhooks containerPort: 8081 diff --git a/install/helm/agones/templates/service/allocation.yaml b/install/helm/agones/templates/service/allocation.yaml index 7d5794a205..cd3ae400e3 100644 --- a/install/helm/agones/templates/service/allocation.yaml +++ b/install/helm/agones/templates/service/allocation.yaml @@ -213,7 +213,7 @@ spec: {{ toYaml .Values.agones.allocator.tolerations | indent 8 }} {{- end }} serviceAccountName: {{ $.Values.agones.serviceaccount.allocator.name }} - terminationGracePeriodSeconds: {{ mul .Values.agones.allocator.readiness.periodSeconds .Values.agones.allocator.readiness.failureThreshold 5 }} + terminationGracePeriodSeconds: {{ mul .Values.agones.allocator.readiness.periodSeconds .Values.agones.allocator.readiness.failureThreshold 3 }} {{- if eq .Values.agones.allocator.disableTLS false }} volumes: - name: tls @@ -292,7 +292,7 @@ spec: - name: ALLOCATION_BATCH_WAIT_TIME value: {{ .Values.agones.allocator.allocationBatchWaitTime | quote }} - name: READINESS_SHUTDOWN_DURATION - value: {{ mul .Values.agones.allocator.readiness.periodSeconds .Values.agones.extensions.readiness.failureThreshold 4 }}s + value: {{ mul .Values.agones.allocator.readiness.periodSeconds .Values.agones.extensions.readiness.failureThreshold 2 }}s ports: {{- if .Values.agones.allocator.service.http.enabled }} - name: {{ .Values.agones.allocator.service.http.portName }} diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml index 2f1c963cbb..1cf30b8786 100644 --- a/install/yaml/install.yaml +++ b/install/yaml/install.yaml @@ -17216,7 +17216,7 @@ spec: value: "true" priorityClassName: agones-system serviceAccountName: agones-controller - terminationGracePeriodSeconds: 45 + terminationGracePeriodSeconds: 27 containers: - name: agones-extensions image: "us-docker.pkg.dev/agones-images/release/agones-extensions:1.41.0-dev" @@ -17259,7 +17259,7 @@ spec: - name: CONTAINER_NAME value: "agones-extensions" - name: READINESS_SHUTDOWN_DURATION - value: 36s + value: 18s ports: - name: webhooks containerPort: 8081 @@ -17420,7 +17420,7 @@ spec: operator: Equal value: "true" serviceAccountName: agones-allocator - terminationGracePeriodSeconds: 45 + terminationGracePeriodSeconds: 27 volumes: - name: tls secret: @@ -17491,7 +17491,7 @@ spec: - name: ALLOCATION_BATCH_WAIT_TIME value: "500ms" - name: READINESS_SHUTDOWN_DURATION - value: 36s + value: 18s ports: - name: https containerPort: 8443 diff --git a/test/e2e/allocator/pod_termination_test.go b/test/e2e/allocator/pod_termination_test.go index 58fcf456d8..fc051b739a 100644 --- a/test/e2e/allocator/pod_termination_test.go +++ b/test/e2e/allocator/pod_termination_test.go @@ -34,7 +34,7 @@ import ( const ( retryInterval = 5 * time.Second - retryTimeout = 60 * time.Second + retryTimeout = 45 * time.Second ) func TestAllocatorAfterDeleteReplica(t *testing.T) { diff --git a/test/e2e/extensions/high_availability_test.go b/test/e2e/extensions/high_availability_test.go index 74c3fdf499..49ec425628 100644 --- a/test/e2e/extensions/high_availability_test.go +++ b/test/e2e/extensions/high_availability_test.go @@ -84,7 +84,7 @@ func TestGameServerCreationRightAfterDeletingOneExtensionsPod(t *testing.T) { logger.Infof("Removing one of the Extensions Pods: %v", list.Items[1].ObjectMeta.Name) deleteAgonesExtensionsPod(ctx, t, false) - endTime := time.Now().Add(60 * time.Second) + endTime := time.Now().Add(30 * time.Second) for time.Now().Before(endTime) { gs := framework.DefaultGameServer(defaultNs) logger.Infof("Creating game-server %s...", gs.Name)