Skip to content

Commit

Permalink
Moved job cleaner to managed namespaces
Browse files Browse the repository at this point in the history
  • Loading branch information
Eagerod committed Jun 12, 2024
1 parent d63ad3f commit c26eb4c
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 126 deletions.
12 changes: 12 additions & 0 deletions hope.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,11 @@ resources:
- INCLUDE_EXTERNAL_CERTS=true
- INCLUDE_BARE_DOMAIN=true
- POD_KILLER_CONTAINER_RESTART_LIMIT=10
- JOB_RETENTION_WINDOW=1 month
fileParameters:
- UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
- POD_KILLER_SCRIPT=infra/pod-killer-script.sh
- JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
tags: [namespaces]
- name: dev-namespace
file: infra/namespace.yaml
Expand All @@ -162,9 +164,11 @@ resources:
- INCLUDE_EXTERNAL_CERTS=false
- INCLUDE_BARE_DOMAIN=false
- POD_KILLER_CONTAINER_RESTART_LIMIT=10
- JOB_RETENTION_WINDOW=1 month
fileParameters:
- UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
- POD_KILLER_SCRIPT=infra/pod-killer-script.sh
- JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
tags: [namespaces, rotate-node]
- name: monitoring-namespace
file: infra/namespace.yaml
Expand All @@ -176,9 +180,11 @@ resources:
- INCLUDE_EXTERNAL_CERTS=false
- INCLUDE_BARE_DOMAIN=false
- POD_KILLER_CONTAINER_RESTART_LIMIT=10
- JOB_RETENTION_WINDOW=1 month
fileParameters:
- UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
- POD_KILLER_SCRIPT=infra/pod-killer-script.sh
- JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
tags: [namespaces, monitoring]
- name: kube-system-namespace
file: infra/namespace.yaml
Expand All @@ -190,9 +196,11 @@ resources:
- INCLUDE_EXTERNAL_CERTS=false
- INCLUDE_BARE_DOMAIN=false
- POD_KILLER_CONTAINER_RESTART_LIMIT=10
- JOB_RETENTION_WINDOW=1 month
fileParameters:
- UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
- POD_KILLER_SCRIPT=infra/pod-killer-script.sh
- JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
tags: [namespaces]
- name: kubernetes-dashboard-namespace
file: infra/namespace.yaml
Expand All @@ -204,9 +212,11 @@ resources:
- INCLUDE_EXTERNAL_CERTS=false
- INCLUDE_BARE_DOMAIN=false
- POD_KILLER_CONTAINER_RESTART_LIMIT=10
- JOB_RETENTION_WINDOW=1 month
fileParameters:
- UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
- POD_KILLER_SCRIPT=infra/pod-killer-script.sh
- JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
tags: [namespaces]
- name: tasks-namespace
file: infra/namespace.yaml
Expand All @@ -218,9 +228,11 @@ resources:
- INCLUDE_EXTERNAL_CERTS=false
- INCLUDE_BARE_DOMAIN=false
- POD_KILLER_CONTAINER_RESTART_LIMIT=10
- JOB_RETENTION_WINDOW=1 month
fileParameters:
- UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh
- POD_KILLER_SCRIPT=infra/pod-killer-script.sh
- JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh
tags: [namespaces, rmq, tasks]
# endregion
- name: calico
Expand Down
1 change: 1 addition & 0 deletions infra/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ Each namespace ships with:
- A set of docker registry secrets
- A `CronJob` to copy SSL certs from filesystem to Kubernetes `Secret`
- A `CronJob` to kill pods whose containers have > 10 restarts
- A `Deployment` to delete old manually run jobs
38 changes: 38 additions & 0 deletions infra/delete-manual-jobs-script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env sh
#
# Delete jobs that appear to be CronJobs that have been run manually from the
# Kubernetes Dashboard.
#
# Environment:
#   SLACK_BOT_ALERTING_CHANNEL  Channel ID sent in the X-SLACK-CHANNEL-ID
#                               header of every notification (required).
set -euf

SLACK_URL="https://slackbot.internal.aleemhaji.com/message"

# Post one message to the Slack bot endpoint.
#
# Arguments: $1 - message text, sent as the POST body.
# Returns:   curl's exit status; under `set -e` a failed post aborts the script.
#
# Only $1 is forwarded: any extra argument would otherwise be handed to curl
# as an additional URL or option. A missing channel ID aborts with an explicit
# message instead of a bare "unbound variable" error from `set -u`.
slack() {
  curl -sS -X POST \
    -H "X-SLACK-CHANNEL-ID: ${SLACK_BOT_ALERTING_CHANNEL:?SLACK_BOT_ALERTING_CHANNEL must be set}" \
    -d "$1" \
    "$SLACK_URL"
}

if [ $# -ne 2 ]; then
  echo >&2 "Usage:"
  echo >&2 " $0 <namespace> <age>"
  exit 1
fi

namespace="$1"   # namespace whose jobs are inspected and deleted
age_str="$2"     # retention window, handed to `date -d "<age> ago"` (e.g. "1 month")

# Job names ending in "-manual-" plus 3-5 alphanumerics. The trailing
# whitespace class anchors the match to the end of the NAME column.
MANUAL_JOB_REGEXP='-manual-[[:alnum:]]\{3,5\}[[:space:]]'
JOBS_COLUMNS='custom-columns=NAME:{.metadata.name},SUCCEEDED:{.status.succeeded},COMPLETED:{.status.completionTime}'
# Print the name of every job with exactly one success whose completion time
# sorts before the cutoff. The timestamps share one fixed-width UTC format, so
# a plain string comparison orders them chronologically.
AWK_SCRIPT='{if ($2 == 1 && $3 < cutoff) print $1}'

# Print the UTC timestamp that lies "$age_str" before now, in the same format
# as the COMPLETED column.
retention_cutoff() {
  date -u -d "$age_str ago" '+%Y-%m-%dT%H:%M:%SZ'
}

# Fail fast, before announcing anything on Slack, if <age> cannot be parsed.
retention_cutoff > /dev/null

slack 'Manual job-run cleanup running on '"$(hostname)"'.
Deleting successful manually run jobs in namespace "'"$namespace"'" older than '"$age_str"'.'

while true; do
  echo "Run: $(date)"

  # Recompute the cutoff on every pass. This loop runs indefinitely, so a
  # cutoff fixed at startup would never move forward and jobs that finished
  # after (startup - age) would never become eligible for deletion.
  cutoff="$(retention_cutoff)"

  # grep exiting 1 on "no match" does not abort the script: without pipefail
  # only the final pipeline stage (the while loop) determines the status.
  kubectl -n "$namespace" get jobs -o "$JOBS_COLUMNS" \
    | sed '1d' \
    | grep -- "$MANUAL_JOB_REGEXP" \
    | awk -v "cutoff=$cutoff" "$AWK_SCRIPT" \
    | while read -r job; do
      slack "Job monitor deleting old manually run job: $job"
      kubectl -n "${namespace}" delete job "$job"
    done

  sleep 3600
done
80 changes: 80 additions & 0 deletions infra/namespace.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,83 @@ spec:
configMap:
name: pod-killer-config
defaultMode: 0755
---
# endregion
# region: Job Cleaner
# Identity used by the jobs-cleaner pod; granted job access through the
# jobs-monitor Role and RoleBinding in the same namespace.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: ${NAMESPACE}
  name: jobs-monitor
---
# Namespace-scoped permissions for the job cleaner: list jobs to find
# candidates and delete the ones it selects.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: ${NAMESPACE}
  name: jobs-monitor
rules:
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - list
      - delete
---
# Grants the jobs-monitor Role to the jobs-monitor ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: ${NAMESPACE}
  name: jobs-monitor
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: jobs-monitor
subjects:
  - name: jobs-monitor
    kind: ServiceAccount
---
# Holds the job cleaner script; mounted into the jobs-cleaner pod at /scripts.
# The script body is injected through the JOBS_SHELL_MONITOR_SCRIPT parameter.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: ${NAMESPACE}
  name: job-monitor-script
  labels:
    app: jobs-monitor
binaryData:
  script.sh: ${JOBS_SHELL_MONITOR_SCRIPT}
---
# Single-replica Deployment that runs the job cleaner script against this
# namespace, keeping jobs for JOB_RETENTION_WINDOW.
#
# NOTE(review): infra/delete-manual-jobs-script.sh expands
# SLACK_BOT_ALERTING_CHANNEL under `set -u`, but no `env` is declared for the
# container here - confirm the image or cluster provides it.
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: ${NAMESPACE}
  name: jobs-cleaner
  labels:
    app: jobs-cleaner
spec:
  replicas: 1
  revisionHistoryLimit: 0
  selector:
    matchLabels:
      app: jobs-cleaner
  template:
    metadata:
      labels:
        app: jobs-cleaner
    spec:
      serviceAccountName: jobs-monitor
      imagePullSecrets:
        - name: registry.internal.aleemhaji.com
      volumes:
        - name: job-monitor-script
          configMap:
            name: job-monitor-script
            # Executable bit is required: the script is invoked directly.
            defaultMode: 0755
      containers:
        - name: jobs-cleaner
          image: registry.internal.aleemhaji.com/kubectl:1.21.0
          # script.sh <namespace> <age>
          command: ["/scripts/script.sh", "${NAMESPACE}", "${JOB_RETENTION_WINDOW}"]
          volumeMounts:
            - name: job-monitor-script
              mountPath: /scripts
---
# endregion
1 change: 1 addition & 0 deletions infra/pod-killer-script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ slack 'Pod killer starting up on '"$(hostname)"'.
Killing pods in namespace "'"$namespace"'" with '"$n_restarts"' or more container restarts.'

while true; do
echo "Run: $(date)"
kubectl get pods -n "${namespace}" -o template="$pod_template" | awk '{for (i = 0; i < $2; i++) print $1}' | uniq -c | awk '$1 >= '"$n_restarts"' { print $2 }' | while read -r pod; do
slack "Pod killer is killing \"${namespace}/$pod\""
kubectl delete pod -n "${namespace}" "$pod"
Expand Down
21 changes: 0 additions & 21 deletions shell-monitor/delete-manual-jobs-monitor.sh

This file was deleted.

105 changes: 0 additions & 105 deletions shell-monitor/delete-manual-jobs-monitor.yaml

This file was deleted.

0 comments on commit c26eb4c

Please sign in to comment.