diff --git a/certbot/certbot-generic-cron.yaml b/certbot/certbot-generic-cron.yaml deleted file mode 100644 index 3bed6de..0000000 --- a/certbot/certbot-generic-cron.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# The certbot update itself doesn't require any kind of special Kubernetes -# access, but the process that copies the secrets from the file share to -# Kubernetes' Secrets store does need to make API calls to Kubernetes. -# Create an account, and give it the ability to read and write Secrets in the -# namespace this cron will copy data to. -apiVersion: v1 -kind: ServiceAccount -metadata: - name: certbot - namespace: ${KUBERNETES_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: certbot-role - namespace: ${KUBERNETES_NAMESPACE} -rules: - - apiGroups: [""] - resources: ["secrets"] - verbs: ["get", "list", "patch", "create"] # Might actually be able to remove "list"? ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: certbot-manage-secrets - namespace: ${KUBERNETES_NAMESPACE} -subjects: - - kind: ServiceAccount - name: certbot -roleRef: - kind: Role - name: certbot-role - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: certbot-update-script - namespace: ${KUBERNETES_NAMESPACE} -binaryData: - update-secrets.sh: "${UPDATE_SECRETS_SCRIPT}" ---- -apiVersion: batch/v1beta1 -kind: CronJob -metadata: - name: certbot-copy-to-${KUBERNETES_NAMESPACE} - namespace: ${KUBERNETES_NAMESPACE} -spec: - schedule: "0 0 * * *" - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 3 - concurrencyPolicy: Forbid - jobTemplate: - spec: - template: - metadata: - labels: - app: certbot-copy-to-${KUBERNETES_NAMESPACE} - spec: - restartPolicy: OnFailure - serviceAccountName: certbot - imagePullSecrets: - - name: registry.internal.aleemhaji.com - containers: - - name: certbot-copy-to-${KUBERNETES_NAMESPACE} - image: registry.internal.aleemhaji.com/kubectl:1.21.0 - command: - - 
sh - - /scripts/update-secrets.sh - env: - - name: KUBERNETES_NAMESPACE - value: ${KUBERNETES_NAMESPACE} - volumeMounts: - - name: certbot-storage - mountPath: /etc/letsencrypt - - name: certbot-scripts - mountPath: /scripts - volumes: - - name: certbot-storage - nfs: - server: 192.168.96.4 - path: /mnt/main/apps/certificates - - name: certbot-scripts - configMap: - name: certbot-update-script - defaultMode: 0755 diff --git a/hope.yaml b/hope.yaml index 0354e68..10b44c2 100644 --- a/hope.yaml +++ b/hope.yaml @@ -138,29 +138,102 @@ loglevel: *log_level pod_network_cidr: 10.244.0.0/16 resources: # region: Namespaces - - name: load-balancer-namespace - inline: | - apiVersion: v1 - kind: Namespace - metadata: - name: metallb-system - labels: - app: metallb - tags: [network, load-balancer] + - name: default-namespace + file: infra/namespace.yaml + parameters: + - NAMESPACE=default + - DOCKER_REGISTRY_HOSTNAME + - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 + - SLACK_BOT_ALERTING_CHANNEL + - INCLUDE_EXTERNAL_CERTS=true + - INCLUDE_BARE_DOMAIN=true + - POD_KILLER_CONTAINER_RESTART_LIMIT=10 + - JOB_RETENTION_WINDOW=1 month + fileParameters: + - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh + - POD_KILLER_SCRIPT=infra/pod-killer-script.sh + - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh + tags: [namespaces] - name: dev-namespace - inline: | - apiVersion: v1 - kind: Namespace - metadata: - name: dev - tags: [rotate-node] + file: infra/namespace.yaml + parameters: + - NAMESPACE=dev + - DOCKER_REGISTRY_HOSTNAME + - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 + - SLACK_BOT_ALERTING_CHANNEL + - INCLUDE_EXTERNAL_CERTS=false + - INCLUDE_BARE_DOMAIN=false + - POD_KILLER_CONTAINER_RESTART_LIMIT=10 + - JOB_RETENTION_WINDOW=1 month + fileParameters: + - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh + - POD_KILLER_SCRIPT=infra/pod-killer-script.sh + - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh + tags: [namespaces, rotate-node] + - name: 
monitoring-namespace + file: infra/namespace.yaml + parameters: + - NAMESPACE=monitoring + - DOCKER_REGISTRY_HOSTNAME + - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 + - SLACK_BOT_ALERTING_CHANNEL + - INCLUDE_EXTERNAL_CERTS=false + - INCLUDE_BARE_DOMAIN=false + - POD_KILLER_CONTAINER_RESTART_LIMIT=10 + - JOB_RETENTION_WINDOW=1 month + fileParameters: + - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh + - POD_KILLER_SCRIPT=infra/pod-killer-script.sh + - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh + tags: [namespaces, monitoring] + - name: kube-system-namespace + file: infra/namespace.yaml + parameters: + - NAMESPACE=kube-system + - DOCKER_REGISTRY_HOSTNAME + - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 + - SLACK_BOT_ALERTING_CHANNEL + - INCLUDE_EXTERNAL_CERTS=false + - INCLUDE_BARE_DOMAIN=false + - POD_KILLER_CONTAINER_RESTART_LIMIT=10 + - JOB_RETENTION_WINDOW=1 month + fileParameters: + - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh + - POD_KILLER_SCRIPT=infra/pod-killer-script.sh + - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh + tags: [namespaces] + - name: kubernetes-dashboard-namespace + file: infra/namespace.yaml + parameters: + - NAMESPACE=kubernetes-dashboard + - DOCKER_REGISTRY_HOSTNAME + - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 + - SLACK_BOT_ALERTING_CHANNEL + - INCLUDE_EXTERNAL_CERTS=false + - INCLUDE_BARE_DOMAIN=false + - POD_KILLER_CONTAINER_RESTART_LIMIT=10 + - JOB_RETENTION_WINDOW=1 month + fileParameters: + - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh + - POD_KILLER_SCRIPT=infra/pod-killer-script.sh + - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh + tags: [namespaces] - name: tasks-namespace - inline: | - apiVersion: v1 - kind: Namespace - metadata: - name: tasks - tags: [rmq, tasks] + file: infra/namespace.yaml + parameters: + - NAMESPACE=tasks + - DOCKER_REGISTRY_HOSTNAME + - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 + - SLACK_BOT_ALERTING_CHANNEL + - INCLUDE_EXTERNAL_CERTS=false + - 
INCLUDE_BARE_DOMAIN=false + - POD_KILLER_CONTAINER_RESTART_LIMIT=10 + - JOB_RETENTION_WINDOW=1 month + fileParameters: + - UPDATE_SECRETS_SCRIPT=infra/certbot-copy-script.sh + - POD_KILLER_SCRIPT=infra/pod-killer-script.sh + - JOBS_SHELL_MONITOR_SCRIPT=infra/delete-manual-jobs-script.sh + tags: [namespaces, rmq, tasks] # endregion - name: calico file: calico.yaml @@ -422,41 +495,6 @@ resources: port: number: 443 tags: [apps, dashboard] - - name: cluster-registry-secrets-default - file: registry/registry-secrets.yaml - parameters: - - NAMESPACE=default - - DOCKER_REGISTRY_HOSTNAME - - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 - tags: [apps, registry] - - name: cluster-registry-secrets-dev - file: registry/registry-secrets.yaml - parameters: - - NAMESPACE=dev - - DOCKER_REGISTRY_HOSTNAME - - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 - tags: [apps, registry] - - name: cluster-registry-secrets-monitoring - file: registry/registry-secrets.yaml - parameters: - - NAMESPACE=monitoring - - DOCKER_REGISTRY_HOSTNAME - - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 - tags: [apps, registry] - - name: cluster-registry-secrets-kube-system - file: registry/registry-secrets.yaml - parameters: - - NAMESPACE=kube-system - - DOCKER_REGISTRY_HOSTNAME - - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 - tags: [apps, registry] - - name: cluster-registry-secrets-tasks - file: registry/registry-secrets.yaml - parameters: - - NAMESPACE=tasks - - DOCKER_REGISTRY_HOSTNAME - - DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64 - tags: [apps, registry] - name: docker-registry-htpasswd-secrets inline: | apiVersion: v1 @@ -1191,49 +1229,6 @@ resources: file: drone/drone.yaml tags: [apps, drone] # endregion - - name: pod-killer - file: pod-killer/pod-killer.yaml - parameters: - - POD_KILLER_NAMESPACE=default - - SLACK_BOT_ALERTING_CHANNEL - fileParameters: - - POD_KILLER_SCRIPT=pod-killer/pod-killer.sh - - name: certbot-update-kubernetes-dashboard - file: certbot/certbot-generic-cron.yaml - parameters: - - 
KUBERNETES_NAMESPACE=kubernetes-dashboard - - INCLUDE_EXTERNAL_CERTS=false - - INCLUDE_BARE_DOMAIN=false - fileParameters: - - UPDATE_SECRETS_SCRIPT=certbot/certbot-copy-script.sh - tags: [crons, certbot] - - name: certbot-update-monitoring - file: certbot/certbot-generic-cron.yaml - parameters: - - KUBERNETES_NAMESPACE=monitoring - - INCLUDE_EXTERNAL_CERTS=false - - INCLUDE_BARE_DOMAIN=false - fileParameters: - - UPDATE_SECRETS_SCRIPT=certbot/certbot-copy-script.sh - tags: [crons, certbot] - - name: certbot-update-default - file: certbot/certbot-generic-cron.yaml - parameters: - - KUBERNETES_NAMESPACE=default - - INCLUDE_EXTERNAL_CERTS=true - - INCLUDE_BARE_DOMAIN=true - fileParameters: - - UPDATE_SECRETS_SCRIPT=certbot/certbot-copy-script.sh - tags: [crons, certbot] - - name: certbot-update-tasks - file: certbot/certbot-generic-cron.yaml - parameters: - - KUBERNETES_NAMESPACE=tasks - - INCLUDE_EXTERNAL_CERTS=false - - INCLUDE_BARE_DOMAIN=false - fileParameters: - - UPDATE_SECRETS_SCRIPT=certbot/certbot-copy-script.sh - tags: [crons, certbot] - name: certbot-cron file: certbot/certbot.yaml fileParameters: diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..95233f9 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,7 @@ +Various templates and such for managing things that keep the cluster neat. 
+ +Each namespace ships with: +- A set of docker registry secrets +- A `CronJob` to copy SSL certs from filesystem to Kubernetes `Secret` +- A `Deployment` to kill pods whose containers have reached the configured restart limit +- A `Deployment` to delete old manual job runs diff --git a/certbot/certbot-copy-script.sh b/infra/certbot-copy-script.sh similarity index 100% rename from certbot/certbot-copy-script.sh rename to infra/certbot-copy-script.sh diff --git a/infra/delete-manual-jobs-script.sh b/infra/delete-manual-jobs-script.sh new file mode 100644 index 0000000..b06e8d9 --- /dev/null +++ b/infra/delete-manual-jobs-script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env sh +# +# Delete jobs that appear to be CronJobs that have been run manually from the +# Kubernetes Dashboard. +# +# Usage: delete-manual-jobs-script.sh <namespace> <age> +# <age> is handed to `date -d "<age> ago"` (for example "1 month"). +set -euf + +SLACK_URL="https://slackbot.internal.aleemhaji.com/message" + +slack() { + curl -sS -X POST -H "X-SLACK-CHANNEL-ID: ${SLACK_BOT_ALERTING_CHANNEL}" -d "$@" "$SLACK_URL" +} + +if [ $# -ne 2 ]; then + echo >&2 "Usage:" + echo >&2 " $0 <namespace> <age>" + exit 1 +fi + +namespace="$1" +age_str="$2" + +MANUAL_JOB_REGEXP='-manual-[[:alnum:]]\{3,5\}[[:space:]]' +ONE_MONTH_AGO="$(date -u -d "$age_str ago" '+%Y-%m-%dT%H:%M:%SZ')" +JOBS_COLUMNS='custom-columns=NAME:{.metadata.name},SUCCEEDED:{.status.succeeded},COMPLETED:{.status.completionTime}' +# shellcheck disable=SC2016 +AWK_SCRIPT='{if ($2 == 1 && $3 < arg) print $1}' + +slack 'Manual job-run cleanup running on '"$(hostname)"'. +Deleting successful manually run jobs in namespace "'"$namespace"'" older than '"$age_str"'.' 
+ +while true; do + echo "Run: $(date)" + # Refresh the cutoff on every pass; a value computed once at startup goes stale. + ONE_MONTH_AGO="$(date -u -d "$age_str ago" '+%Y-%m-%dT%H:%M:%SZ')" + kubectl -n "$namespace" get jobs -o "$JOBS_COLUMNS" | sed '1d' | grep -- "$MANUAL_JOB_REGEXP" | awk -v "arg=$ONE_MONTH_AGO" "$AWK_SCRIPT" | while read -r job; do + slack "Job monitor deleting old manually run job: $job" + kubectl -n "${namespace}" delete job "$job" + done + + sleep 3600 +done diff --git a/infra/namespace.yaml b/infra/namespace.yaml new file mode 100644 index 0000000..cc1d5b4 --- /dev/null +++ b/infra/namespace.yaml @@ -0,0 +1,263 @@ +# region: Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: "${NAMESPACE}" +--- +# endregion +# region: Docker Secrets +apiVersion: v1 +kind: Secret +metadata: + name: ${DOCKER_REGISTRY_HOSTNAME} + namespace: ${NAMESPACE} +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: ${DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64} +--- +# endregion +# region: Certificate Management +apiVersion: v1 +kind: ServiceAccount +metadata: + name: certbot + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: certbot + namespace: ${NAMESPACE} +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "patch", "create"] # Might actually be able to remove "list"? 
+--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: certbot-manage-secrets + namespace: ${NAMESPACE} +subjects: + - kind: ServiceAccount + name: certbot + namespace: ${NAMESPACE} +roleRef: + kind: Role + name: certbot + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: certbot-update-script + namespace: ${NAMESPACE} +binaryData: + update-secrets.sh: "${UPDATE_SECRETS_SCRIPT}" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: certbot-copy + namespace: ${NAMESPACE} +spec: + schedule: "0 0 * * *" + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + metadata: + labels: + app: certbot-copy + spec: + restartPolicy: OnFailure + serviceAccountName: certbot + imagePullSecrets: + - name: registry.internal.aleemhaji.com + containers: + - name: certbot-copy + image: registry.internal.aleemhaji.com/kubectl:1.21.0 + command: + - sh + - /scripts/update-secrets.sh + env: + - name: KUBERNETES_NAMESPACE + value: ${NAMESPACE} + - name: INCLUDE_EXTERNAL_CERTS + value: "${INCLUDE_EXTERNAL_CERTS}" + - name: INCLUDE_BARE_DOMAIN + value: "${INCLUDE_BARE_DOMAIN}" + volumeMounts: + - name: certbot-storage + mountPath: /etc/letsencrypt + - name: certbot-scripts + mountPath: /scripts + volumes: + - name: certbot-storage + nfs: + server: 192.168.96.4 + path: /mnt/main/apps/certificates + - name: certbot-scripts + configMap: + name: certbot-update-script + defaultMode: 0755 +--- +# endregion +# region: Pod Killer +apiVersion: v1 +kind: ServiceAccount +metadata: + name: pod-killer + namespace: ${NAMESPACE} +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-killer + namespace: ${NAMESPACE} +rules: + - apiGroups: [""] + resources: [pods] + verbs: [get, delete, list, watch] +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-killer + namespace: ${NAMESPACE} +subjects: + - 
kind: ServiceAccount + name: pod-killer +roleRef: + kind: Role + name: pod-killer + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +metadata: + labels: + app: pod-killer + name: pod-killer-config + namespace: ${NAMESPACE} +binaryData: + script.sh: ${POD_KILLER_SCRIPT} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pod-killer + namespace: ${NAMESPACE} + labels: + app: pod-killer +spec: + revisionHistoryLimit: 0 + replicas: 1 + selector: + matchLabels: + app: pod-killer + template: + metadata: + labels: + app: pod-killer + spec: + serviceAccountName: pod-killer + imagePullSecrets: + - name: registry.internal.aleemhaji.com + containers: + - name: pod-killer + image: registry.internal.aleemhaji.com/kubectl:1.21.0 + command: + - /scripts/script.sh + - "${NAMESPACE}" + - "${POD_KILLER_CONTAINER_RESTART_LIMIT}" + volumeMounts: + - name: pod-killer-scripts + mountPath: /scripts + volumes: + - name: pod-killer-scripts + configMap: + name: pod-killer-config + defaultMode: 0755 +--- +# endregion +# region: Job Cleaner +apiVersion: v1 +kind: ServiceAccount +metadata: + name: jobs-monitor + namespace: ${NAMESPACE} +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: jobs-monitor + namespace: ${NAMESPACE} +rules: + - apiGroups: [batch] + resources: [jobs] + verbs: [list, delete] +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: jobs-monitor + namespace: ${NAMESPACE} +subjects: + - kind: ServiceAccount + name: jobs-monitor +roleRef: + kind: Role + name: jobs-monitor + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: job-monitor-script + namespace: ${NAMESPACE} + labels: + app: jobs-monitor +binaryData: + script.sh: ${JOBS_SHELL_MONITOR_SCRIPT} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jobs-cleaner + namespace: ${NAMESPACE} + labels: + app: jobs-cleaner +spec: + revisionHistoryLimit: 0 + replicas: 1 + selector: + 
matchLabels: + app: jobs-cleaner + template: + metadata: + labels: + app: jobs-cleaner + spec: + serviceAccountName: jobs-monitor + imagePullSecrets: + - name: registry.internal.aleemhaji.com + containers: + - name: jobs-cleaner + image: registry.internal.aleemhaji.com/kubectl:1.21.0 + command: + - /scripts/script.sh + - "${NAMESPACE}" + - "${JOB_RETENTION_WINDOW}" + volumeMounts: + - name: job-monitor-script + mountPath: /scripts + volumes: + - name: job-monitor-script + configMap: + name: job-monitor-script + defaultMode: 0755 +--- +# endregion diff --git a/pod-killer/pod-killer.sh b/infra/pod-killer-script.sh similarity index 52% rename from pod-killer/pod-killer.sh rename to infra/pod-killer-script.sh index 8527c7f..3a95f91 100644 --- a/pod-killer/pod-killer.sh +++ b/infra/pod-killer-script.sh @@ -10,23 +10,26 @@ slack() { curl -sS -X POST -H "X-SLACK-CHANNEL-ID: ${SLACK_BOT_ALERTING_CHANNEL}" -d "$@" "$SLACK_URL" } -if [ $# -ne 1 ]; then +if [ $# -ne 2 ]; then echo >&2 "Usage:" - echo >&2 " $0 " + echo >&2 " $0 <namespace> <n_restarts>" exit 1 fi +namespace="$1" +n_restarts="$2" # shellcheck disable=SC2016 pod_template='{{range .items}}{{$name := .metadata.name}}{{range .status.containerStatuses}}{{$name}} {{.restartCount}} {{end}}{{end}}' slack 'Pod killer starting up on '"$(hostname)"'. -Killing pods with '"$1"' or more container restarts.' +Killing pods in namespace "'"$namespace"'" with '"$n_restarts"' or more container restarts.' 
while true; do - kubectl get pods -n "${POD_KILLER_NAMESPACE}" -o template="$pod_template" | awk '{for (i = 0; i < $2; i++) print $1}' | uniq -c | awk '$1 >= '"$1"' { print $2 }' | while read -r pod; do - slack "Pod killer is killing \"${POD_KILLER_NAMESPACE}/$pod\"" - kubectl delete pod -n "${POD_KILLER_NAMESPACE}" "$pod" + echo "Run: $(date)" + kubectl get pods -n "${namespace}" -o template="$pod_template" | awk '{for (i = 0; i < $2; i++) print $1}' | uniq -c | awk '$1 >= '"$n_restarts"' { print $2 }' | while read -r pod; do + slack "Pod killer is killing \"${namespace}/$pod\"" + kubectl delete pod -n "${namespace}" "$pod" done sleep 60 diff --git a/pod-killer/pod-killer.yaml b/pod-killer/pod-killer.yaml deleted file mode 100644 index 9e7376e..0000000 --- a/pod-killer/pod-killer.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Simple shell script to kill pods that have been restarted more than a -# specified number of times. -# May be able to help recover pods that have been allocated to a node that's -# started seeing some condition that makes it fail, like an NFS share that -# needs to be remounted. 
-apiVersion: v1 -kind: ServiceAccount -metadata: - name: pod-killer - namespace: ${POD_KILLER_NAMESPACE} ---- -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-killer - namespace: ${POD_KILLER_NAMESPACE} -rules: - - apiGroups: - - "" - resources: - - pods - verbs: - - get - - delete - - list - - watch ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-killer - namespace: ${POD_KILLER_NAMESPACE} -subjects: - - kind: ServiceAccount - name: pod-killer -roleRef: - kind: Role - name: pod-killer - apiGroup: rbac.authorization.k8s.io ---- -kind: ConfigMap -apiVersion: v1 -metadata: - labels: - app: pod-killer - name: pod-killer-config - namespace: ${POD_KILLER_NAMESPACE} -binaryData: - script.sh: ${POD_KILLER_SCRIPT} - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: pod-killer-deployment - namespace: ${POD_KILLER_NAMESPACE} - labels: - app: pod-killer -spec: - revisionHistoryLimit: 0 - replicas: 1 - selector: - matchLabels: - app: pod-killer - template: - metadata: - labels: - app: pod-killer - spec: - serviceAccountName: pod-killer - imagePullSecrets: - - name: registry.internal.aleemhaji.com - containers: - - name: pod-killer - image: registry.internal.aleemhaji.com/kubectl:1.21.0 - command: - - /scripts/script.sh - - "10" - volumeMounts: - - name: pod-killer-scripts - mountPath: /scripts - volumes: - - name: pod-killer-scripts - configMap: - name: pod-killer-config - defaultMode: 0755 diff --git a/registry/registry-secrets.yaml b/registry/registry-secrets.yaml deleted file mode 100644 index 3612261..0000000 --- a/registry/registry-secrets.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -data: - .dockerconfigjson: ${DOCKER_CONFIG_JSON_FILE_CONTENTS_BASE64} -kind: Secret -metadata: - name: ${DOCKER_REGISTRY_HOSTNAME} - namespace: ${NAMESPACE} -type: kubernetes.io/dockerconfigjson diff --git a/shell-monitor/delete-manual-jobs-monitor.sh b/shell-monitor/delete-manual-jobs-monitor.sh deleted 
file mode 100644 index 30b732c..0000000 --- a/shell-monitor/delete-manual-jobs-monitor.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env sh -# -# Delete jobs that appear to be CronJobs that have been run manually from the -# Kubernetes Dashboard. -set -euf - -MANUAL_JOB_REGEXP='-manual-[[:alnum:]]\{3\}[[:space:]]' -ONE_MONTH_AGO="$(date -u -d '1 month ago' '+%Y-%m-%dT%H:%M:%SZ')" - -# Newline to start the monitored output. -echo "" - -kubectl -n "$KUBE_NAMESPACE" get jobs -o custom-columns="NAME:{.metadata.name},SUCCEEDED:{.status.succeeded},COMPLETED:{.status.completionTime}" | \ - sed '1d' |\ - grep -- "$MANUAL_JOB_REGEXP" |\ - awk '{if ($2 == 1) print}' |\ - awk -v "arg=$ONE_MONTH_AGO" '{if ($3 < arg) print $1}' |\ - while read -r job; do - echo "Kubernetes job monitor deleting old manually run job: $job" - kubectl -n "${KUBE_NAMESPACE}" delete job "$job" -done diff --git a/shell-monitor/delete-manual-jobs-monitor.yaml b/shell-monitor/delete-manual-jobs-monitor.yaml deleted file mode 100644 index 0334ca4..0000000 --- a/shell-monitor/delete-manual-jobs-monitor.yaml +++ /dev/null @@ -1,105 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: jobs-monitor - namespace: ${KUBE_NAMESPACE} ---- -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: jobs-monitor - namespace: ${KUBE_NAMESPACE} -rules: - - apiGroups: - - "batch" - resources: - - jobs - verbs: - - list - - delete ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: jobs-monitor-shell-monitor - namespace: ${KUBE_NAMESPACE} -subjects: - - kind: ServiceAccount - name: jobs-monitor -roleRef: - kind: Role - name: shell-monitor - apiGroup: rbac.authorization.k8s.io ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: jobs-monitor-jobs-monitor - namespace: ${KUBE_NAMESPACE} -subjects: - - kind: ServiceAccount - name: jobs-monitor -roleRef: - kind: Role - name: jobs-monitor - apiGroup: rbac.authorization.k8s.io ---- 
-kind: ConfigMap -apiVersion: v1 -metadata: - name: job-monitor-script - namespace: ${KUBE_NAMESPACE} - labels: - app: jobs-shell-monitor -binaryData: - script.sh: ${JOBS_SHELL_MONITOR_SCRIPT} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jobs-shell-monitor-deployment - namespace: ${KUBE_NAMESPACE} - labels: - app: jobs-shell-monitor -spec: - revisionHistoryLimit: 0 - replicas: 1 - selector: - matchLabels: - app: jobs-shell-monitor - template: - metadata: - labels: - app: jobs-shell-monitor - spec: - serviceAccountName: jobs-monitor - imagePullSecrets: - - name: registry.internal.aleemhaji.com - containers: - - name: shell-monitor - image: registry.internal.aleemhaji.com/kubectl:1.21.0 - command: - - /scripts/base/script.sh - env: - - name: CONFIG_MAP_NAME - value: recent-jobs-monitor - - name: UPDATE_SCRIPT - value: /scripts/monitor/script.sh - - name: UPDATE_INTERVAL - value: "3600" - - name: SLACK_BOT_ALERTING_CHANNEL - value: ${SLACK_BOT_ALERTING_CHANNEL} - volumeMounts: - - name: shell-monitor-base-scripts - mountPath: /scripts/base - - name: job-monitor-script - mountPath: /scripts/monitor - volumes: - - name: shell-monitor-base-scripts - configMap: - name: shell-monitor-base-config - defaultMode: 0755 - - name: job-monitor-script - configMap: - name: job-monitor-script - defaultMode: 0755