From c26eb4c1733d0ca2954bb148c8e9bb6a11763217 Mon Sep 17 00:00:00 2001
From: Aleem Haji
Date: Tue, 11 Jun 2024 22:07:56 -0400
Subject: [PATCH] Moved job cleaner to managed namespaces

---
Review notes (ignored by `git am`):

- infra/delete-manual-jobs-script.sh: the cutoff timestamp is computed inside
  the hourly loop instead of once at startup, so the retention window keeps
  moving while the process runs. The usage text now names both arguments, and
  the "cleaup" typo in the startup message is fixed.
- infra/namespace.yaml: the jobs-cleaner container now receives
  SLACK_BOT_ALERTING_CHANNEL, which the script expands under `set -u`; the
  Deployment removed by this patch passed it the same way.
- infra/README.md: the job cleaner is described as a `Deployment`, matching
  the manifest added to infra/namespace.yaml.
- Line breaks and indentation were rebuilt from a whitespace-collapsed copy of
  this patch, and the `index` blob ids of the edited files are stale. Check
  context and removed lines against the tree before applying.

 hope.yaml                                     |  12 ++
 infra/README.md                               |   1 +
 infra/delete-manual-jobs-script.sh            |  49 ++++++++
 infra/namespace.yaml                          |  85 ++++++++++++++
 infra/pod-killer-script.sh                    |   1 +
 shell-monitor/delete-manual-jobs-monitor.sh   |  21 ----
 shell-monitor/delete-manual-jobs-monitor.yaml | 105 ------------------
 7 files changed, 148 insertions(+), 126 deletions(-)
 create mode 100644 infra/delete-manual-jobs-script.sh
 delete mode 100644 shell-monitor/delete-manual-jobs-monitor.sh
 delete mode 100644 shell-monitor/delete-manual-jobs-monitor.yaml

diff --git a/hope.yaml b/hope.yaml
index 9e82121..10b44c2 100644
--- a/hope.yaml
+++ b/hope.yaml
@@ -148,9 +148,11 @@ resources:
       - INCLUDE_EXTERNAL_CERTS=true
       - INCLUDE_BARE_DOMAIN=true
       - POD_KILLER_CONTAINER_RESTART_LIMIT=10
+      - JOB_RETENTION_WINDOW=1 month
     fileParameters:
       - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
       - POD_KILLER_SCRIPT=infra/pod-killer-script.sh
+      - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
     tags: [namespaces]
   - name: dev-namespace
     file: infra/namespace.yaml
@@ -162,9 +164,11 @@ resources:
       - INCLUDE_EXTERNAL_CERTS=false
       - INCLUDE_BARE_DOMAIN=false
       - POD_KILLER_CONTAINER_RESTART_LIMIT=10
+      - JOB_RETENTION_WINDOW=1 month
     fileParameters:
       - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
       - POD_KILLER_SCRIPT=infra/pod-killer-script.sh
+      - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
     tags: [namespaces, rotate-node]
   - name: monitoring-namespace
     file: infra/namespace.yaml
@@ -176,9 +180,11 @@ resources:
       - INCLUDE_EXTERNAL_CERTS=false
       - INCLUDE_BARE_DOMAIN=false
       - POD_KILLER_CONTAINER_RESTART_LIMIT=10
+      - JOB_RETENTION_WINDOW=1 month
     fileParameters:
       - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
       - POD_KILLER_SCRIPT=infra/pod-killer-script.sh
+      - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
     tags: [namespaces, monitoring]
   - name: kube-system-namespace
     file: infra/namespace.yaml
@@ -190,9 +196,11 @@ resources:
       - INCLUDE_EXTERNAL_CERTS=false
       - INCLUDE_BARE_DOMAIN=false
       - POD_KILLER_CONTAINER_RESTART_LIMIT=10
+      - JOB_RETENTION_WINDOW=1 month
     fileParameters:
       - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
       - POD_KILLER_SCRIPT=infra/pod-killer-script.sh
+      - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
     tags: [namespaces]
   - name: kubernetes-dashboard-namespace
     file: infra/namespace.yaml
@@ -204,9 +212,11 @@ resources:
       - INCLUDE_EXTERNAL_CERTS=false
       - INCLUDE_BARE_DOMAIN=false
       - POD_KILLER_CONTAINER_RESTART_LIMIT=10
+      - JOB_RETENTION_WINDOW=1 month
     fileParameters:
       - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
       - POD_KILLER_SCRIPT=infra/pod-killer-script.sh
+      - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
     tags: [namespaces]
   - name: tasks-namespace
     file: infra/namespace.yaml
@@ -218,9 +228,11 @@ resources:
       - INCLUDE_EXTERNAL_CERTS=false
       - INCLUDE_BARE_DOMAIN=false
       - POD_KILLER_CONTAINER_RESTART_LIMIT=10
+      - JOB_RETENTION_WINDOW=1 month
     fileParameters:
       - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
       - POD_KILLER_SCRIPT=infra/pod-killer-script.sh
+      - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
     tags: [namespaces, rmq, tasks]
   # endregion
   - name: calico
diff --git a/infra/README.md b/infra/README.md
index c045410..95233f9 100644
--- a/infra/README.md
+++ b/infra/README.md
@@ -4,3 +4,4 @@ Each namespace ships with:
 - A set of docker registry secrets
 - A `CronJob` to copy SSL certs from filesystem to Kubernetes `Secret`
 - A `CronJob` to kill pods whose containers have > 10 restarts
+- A `Deployment` to delete old manual job runs
diff --git a/infra/delete-manual-jobs-script.sh b/infra/delete-manual-jobs-script.sh
new file mode 100644
index 0000000..8af44c9
--- /dev/null
+++ b/infra/delete-manual-jobs-script.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env sh
+#
+# Delete jobs that appear to be CronJobs that have been run manually from the
+# Kubernetes Dashboard.
+#
+# Usage: script.sh <namespace> <age>
+#   namespace  Namespace whose jobs are listed and deleted.
+#   age        Retention window, passed to `date -d "<age> ago"`, e.g.
+#              "1 month".
+#
+# SLACK_BOT_ALERTING_CHANNEL must be set in the environment; because of
+# `set -u`, the first call to slack() aborts the script when it is not.
+set -euf
+
+SLACK_URL="https://slackbot.internal.aleemhaji.com/message"
+
+slack() {
+    curl -sS -X POST -H "X-SLACK-CHANNEL-ID: ${SLACK_BOT_ALERTING_CHANNEL}" -d "$@" "$SLACK_URL"
+}
+
+if [ $# -ne 2 ]; then
+    echo >&2 "Usage:"
+    echo >&2 "   $0 <namespace> <age>"
+    exit 1
+fi
+
+namespace="$1"
+age_str="$2"
+
+MANUAL_JOB_REGEXP='-manual-[[:alnum:]]\{3,5\}[[:space:]]'
+JOBS_COLUMNS='custom-columns=NAME:{.metadata.name},SUCCEEDED:{.status.succeeded},COMPLETED:{.status.completionTime}'
+# Print the name of every job that succeeded and completed before the cutoff.
+AWK_SCRIPT='{if ($2 == 1 && $3 < arg) print $1}'
+
+slack 'Manual job-run cleanup running on '"$(hostname)"'.
+Deleting successful manually run jobs in namespace "'"$namespace"'" older than '"$age_str"'.'
+
+while true; do
+    echo "Run: $(date)"
+    # Recompute the cutoff on every pass; this process runs indefinitely, so a
+    # cutoff computed once at startup would never advance.
+    cutoff="$(date -u -d "$age_str ago" '+%Y-%m-%dT%H:%M:%SZ')"
+    kubectl -n "$namespace" get jobs -o "$JOBS_COLUMNS" | sed '1d' | grep -- "$MANUAL_JOB_REGEXP" | awk -v "arg=$cutoff" "$AWK_SCRIPT" | while read -r job; do
+        slack "Job monitor deleting old manually run job: $job"
+        kubectl -n "${namespace}" delete job "$job"
+    done
+
+    sleep 3600
+done
diff --git a/infra/namespace.yaml b/infra/namespace.yaml
index 40e5517..586b7d8 100644
--- a/infra/namespace.yaml
+++ b/infra/namespace.yaml
@@ -181,3 +181,88 @@ spec:
           configMap:
             name: pod-killer-config
             defaultMode: 0755
+---
+# endregion
+# region: Job Cleaner
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: jobs-monitor
+  namespace: ${NAMESPACE}
+---
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: jobs-monitor
+  namespace: ${NAMESPACE}
+rules:
+  - apiGroups: [batch]
+    resources: [jobs]
+    verbs: [list, delete]
+---
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: jobs-monitor
+  namespace: ${NAMESPACE}
+subjects:
+  - kind: ServiceAccount
+    name: jobs-monitor
+roleRef:
+  kind: Role
+  name: jobs-monitor
+  apiGroup: rbac.authorization.k8s.io
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: job-monitor-script
+  namespace: ${NAMESPACE}
+  labels:
+    app: jobs-monitor
+binaryData:
+  script.sh: ${JOBS_SHELL_MONITOR_SCRIPT}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: jobs-cleaner
+  namespace: ${NAMESPACE}
+  labels:
+    app: jobs-cleaner
+spec:
+  revisionHistoryLimit: 0
+  replicas: 1
+  selector:
+    matchLabels:
+      app: jobs-cleaner
+  template:
+    metadata:
+      labels:
+        app: jobs-cleaner
+    spec:
+      serviceAccountName: jobs-monitor
+      imagePullSecrets:
+        - name: registry.internal.aleemhaji.com
+      containers:
+        - name: jobs-cleaner
+          image: registry.internal.aleemhaji.com/kubectl:1.21.0
+          command:
+            - /scripts/script.sh
+            - "${NAMESPACE}"
+            - "${JOB_RETENTION_WINDOW}"
+          env:
+            # NOTE(review): script.sh expands this under `set -u`; confirm the
+            # parameter is supplied to this template for every namespace.
+            - name: SLACK_BOT_ALERTING_CHANNEL
+              value: "${SLACK_BOT_ALERTING_CHANNEL}"
+          volumeMounts:
+            - name: job-monitor-script
+              mountPath: /scripts
+      volumes:
+        - name: job-monitor-script
+          configMap:
+            name: job-monitor-script
+            defaultMode: 0755
+---
+# endregion
diff --git a/infra/pod-killer-script.sh b/infra/pod-killer-script.sh
index c68df43..3a95f91 100644
--- a/infra/pod-killer-script.sh
+++ b/infra/pod-killer-script.sh
@@ -26,6 +26,7 @@ slack 'Pod killer starting up on '"$(hostname)"'.
 Killing pods in namespace "'"$namespace"'" with '"$n_restarts"' or more container restarts.'
 
 while true; do
+    echo "Run: $(date)"
     kubectl get pods -n "${namespace}" -o template="$pod_template" | awk '{for (i = 0; i < $2; i++) print $1}' | uniq -c | awk '$1 >= '"$n_restarts"' { print $2 }' | while read -r pod; do
         slack "Pod killer is killing \"${namespace}/$pod\""
         kubectl delete pod -n "${namespace}" "$pod"
diff --git a/shell-monitor/delete-manual-jobs-monitor.sh b/shell-monitor/delete-manual-jobs-monitor.sh
deleted file mode 100644
index 30b732c..0000000
--- a/shell-monitor/delete-manual-jobs-monitor.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env sh
-#
-# Delete jobs that appear to be CronJobs that have been run manually from the
-# Kubernetes Dashboard.
-set -euf
-
-MANUAL_JOB_REGEXP='-manual-[[:alnum:]]\{3\}[[:space:]]'
-ONE_MONTH_AGO="$(date -u -d '1 month ago' '+%Y-%m-%dT%H:%M:%SZ')"
-
-# Newline to start the monitored output.
-echo ""
-
-kubectl -n "$KUBE_NAMESPACE" get jobs -o custom-columns="NAME:{.metadata.name},SUCCEEDED:{.status.succeeded},COMPLETED:{.status.completionTime}" | \
-    sed '1d' |\
-    grep -- "$MANUAL_JOB_REGEXP" |\
-    awk '{if ($2 == 1) print}' |\
-    awk -v "arg=$ONE_MONTH_AGO" '{if ($3 < arg) print $1}' |\
-    while read -r job; do
-        echo "Kubernetes job monitor deleting old manually run job: $job"
-        kubectl -n "${KUBE_NAMESPACE}" delete job "$job"
-done
diff --git a/shell-monitor/delete-manual-jobs-monitor.yaml b/shell-monitor/delete-manual-jobs-monitor.yaml
deleted file mode 100644
index 0334ca4..0000000
--- a/shell-monitor/delete-manual-jobs-monitor.yaml
+++ /dev/null
@@ -1,105 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: jobs-monitor
-  namespace: ${KUBE_NAMESPACE}
----
-kind: Role
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: jobs-monitor
-  namespace: ${KUBE_NAMESPACE}
-rules:
-  - apiGroups:
-      - "batch"
-    resources:
-      - jobs
-    verbs:
-      - list
-      - delete
----
-kind: RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: jobs-monitor-shell-monitor
-  namespace: ${KUBE_NAMESPACE}
-subjects:
-  - kind: ServiceAccount
-    name: jobs-monitor
-roleRef:
-  kind: Role
-  name: shell-monitor
-  apiGroup: rbac.authorization.k8s.io
----
-kind: RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: jobs-monitor-jobs-monitor
-  namespace: ${KUBE_NAMESPACE}
-subjects:
-  - kind: ServiceAccount
-    name: jobs-monitor
-roleRef:
-  kind: Role
-  name: jobs-monitor
-  apiGroup: rbac.authorization.k8s.io
----
-kind: ConfigMap
-apiVersion: v1
-metadata:
-  name: job-monitor-script
-  namespace: ${KUBE_NAMESPACE}
-  labels:
-    app: jobs-shell-monitor
-binaryData:
-  script.sh: ${JOBS_SHELL_MONITOR_SCRIPT}
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: jobs-shell-monitor-deployment
-  namespace: ${KUBE_NAMESPACE}
-  labels:
-    app: jobs-shell-monitor
-spec:
-  revisionHistoryLimit: 0
-  replicas: 1
-  selector:
-    matchLabels:
-      app: jobs-shell-monitor
-  template:
-    metadata:
-      labels:
-        app: jobs-shell-monitor
-    spec:
-      serviceAccountName: jobs-monitor
-      imagePullSecrets:
-        - name: registry.internal.aleemhaji.com
-      containers:
-        - name: shell-monitor
-          image: registry.internal.aleemhaji.com/kubectl:1.21.0
-          command:
-            - /scripts/base/script.sh
-          env:
-            - name: CONFIG_MAP_NAME
-              value: recent-jobs-monitor
-            - name: UPDATE_SCRIPT
-              value: /scripts/monitor/script.sh
-            - name: UPDATE_INTERVAL
-              value: "3600"
-            - name: SLACK_BOT_ALERTING_CHANNEL
-              value: ${SLACK_BOT_ALERTING_CHANNEL}
-          volumeMounts:
-            - name: shell-monitor-base-scripts
-              mountPath: /scripts/base
-            - name: job-monitor-script
-              mountPath: /scripts/monitor
-      volumes:
-        - name: shell-monitor-base-scripts
-          configMap:
-            name: shell-monitor-base-config
-            defaultMode: 0755
-        - name: job-monitor-script
-          configMap:
-            name: job-monitor-script
-            defaultMode: 0755