From e89864fcd181e2af34bcdb6f9f3d17a9f1eed9df Mon Sep 17 00:00:00 2001
From: bsctl <adriano@clastix.io>
Date: Tue, 20 Aug 2024 13:44:34 +0200
Subject: [PATCH] feat: use defrag community script

---
 README.md                                     |  6 +-
 charts/kamaji-etcd/README.md                  |  2 -
 .../templates/etcd_cronjob_defrag.yaml        | 66 ---------------
 charts/kamaji-etcd/values.yaml                |  5 --
 docs/defragmentation.md                       | 49 +++++++++++
 scripts/defrag.sh                             | 83 +++++++++++++++++++
 6 files changed, 136 insertions(+), 75 deletions(-)
 delete mode 100644 charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml
 create mode 100644 docs/defragmentation.md
 create mode 100755 scripts/defrag.sh

diff --git a/README.md b/README.md
index 7eb61f9..bec9582 100644
--- a/README.md
+++ b/README.md
@@ -11,9 +11,11 @@ A multi-tenant deployment for `etcd` is not common practice. However, `etcd` pro
 ## Documentation
 Refer to the [etcd documentation](https://etcd.io/docs/v3.5/op-guide). Following sections provide additional procedures to help with a specific setup as it is used into project [Kamaji](https://github.com/clastix/kamaji).
 
-- [Backup and restore from snapshot](docs/snapshot-recovery.md)
-- [Disaster Recovery with Velero](docs/velero.md)
+- [Taking Snapshots](docs/snapshot.md)
+- [Recover from Snapshot](docs/snapshot-recovery.md)
+- [Velero](docs/velero.md)
 - [Rotate Certificates](docs/rotate-certificates.md)
+- [Defragmenting Data](docs/defragmentation.md)
 - [Performance and Optimization](docs/performance-and-optimization.md)
 
 ## Roadmap
diff --git a/charts/kamaji-etcd/README.md b/charts/kamaji-etcd/README.md
index a594bda..3792dd2 100644
--- a/charts/kamaji-etcd/README.md
+++ b/charts/kamaji-etcd/README.md
@@ -67,8 +67,6 @@ Here the values you can override:
 | clusterDomain | string | `"cluster.local"` | Domain of the Kubernetes cluster. |
 | datastore.enabled | bool | `false` | Create a datastore custom resource for Kamaji |
 | datastore.name | string | `""` | Name of Kamaji datastore, set to fully qualified etcd name when null or not provided |
-| defragmentation | object | `{"schedule":"0 0 * * *"}` | Enable storage defragmentation  |
-| defragmentation.schedule | string | `"0 0 * * *"` | The job scheduled maintenance time for defrag (empty to disable) |
 | extraArgs | list | `[]` | A list of extra arguments to add to the etcd default ones |
 | fullnameOverride | string | `""` |  |
 | image.pullPolicy | string | `"IfNotPresent"` | Pull policy to use |
diff --git a/charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml b/charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml
deleted file mode 100644
index f962cb2..0000000
--- a/charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-{{- if .Values.defragmentation.schedule -}}
-apiVersion: batch/v1
-kind: CronJob
-metadata:
-  labels:
-    {{- include "etcd.labels" . | nindent 4 }}
-  name: "{{ .Release.Name }}-defrag"
-  namespace: {{ .Release.Namespace }}
-spec:
-  schedule: "{{ .Values.defragmentation.schedule }}"
-  successfulJobsHistoryLimit: 4
-  jobTemplate:
-    spec:
-      template:
-        spec:
-          {{- with .Values.imagePullSecrets }}
-          imagePullSecrets:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          serviceAccountName: {{ include "etcd.serviceAccountName" . }}
-          restartPolicy: OnFailure
-          containers:
-          - name: etcd-client
-            image: {{ include "etcd.fullyQualifiedDockerImage" . }}
-            imagePullPolicy: {{ .Values.image.pullPolicy }}
-            command:
-              - bash
-              - -c
-              - |-
-                for ENDPOINT in {{ include "etcd.endpoints" . }}; do
-                  etcdctl --endpoints=https://${ENDPOINT} defrag;
-                  etcdctl --endpoints=https://${ENDPOINT} alarm disarm;
-                  etcdctl --endpoints=https://${ENDPOINT} alarm list;
-                  etcdctl --endpoints=https://${ENDPOINT} endpoint status -w table;
-                  etcdctl --endpoints=https://${ENDPOINT} member list -w table;
-                  sleep 15;
-                done;
-            env:
-            - name: ETCDCTL_CACERT
-              value: /opt/certs/ca/ca.crt
-            - name: ETCDCTL_CERT
-              value: /opt/certs/root-client-certs/tls.crt
-            - name: ETCDCTL_KEY
-              value: /opt/certs/root-client-certs/tls.key
-            volumeMounts:
-            - name: root-client-certs
-              mountPath: /opt/certs/root-client-certs
-            - name: certs
-              mountPath: /opt/certs/ca
-          securityContext:
-            runAsUser: 1000
-            runAsGroup: 1000
-            fsGroup: 1000
-          {{- with .Values.tolerations }}
-          tolerations: {{- toYaml . | nindent 12 }}
-          {{- end }}
-          volumes:
-          - name: root-client-certs
-            secret:
-              secretName: {{ include "etcd.clientSecretName" . }}
-              optional: true
-          - name: certs
-            secret:
-              secretName: {{ include "etcd.caSecretName" . }}
-              optional: true
-{{- end }}
diff --git a/charts/kamaji-etcd/values.yaml b/charts/kamaji-etcd/values.yaml
index 39f7991..17431e8 100644
--- a/charts/kamaji-etcd/values.yaml
+++ b/charts/kamaji-etcd/values.yaml
@@ -76,11 +76,6 @@ persistentVolumeClaim:
   customAnnotations: {}
   #  volumeType: local
 
-# -- Enable storage defragmentation 
-defragmentation:
-  # -- The job scheduled maintenance time for defrag (empty to disable)
-  schedule: "0 0 * * *"  # Default cron schedule (daily at midnight), see https://crontab.guru/
-
 # -- Labels to add to all etcd pods
 podLabels:
   application: kamaji-etcd
diff --git a/docs/defragmentation.md b/docs/defragmentation.md
new file mode 100644
index 0000000..7193e44
--- /dev/null
+++ b/docs/defragmentation.md
@@ -0,0 +1,49 @@
+# Defragmenting Data
+For dense Kubernetes clusters, `etcd` can suffer from poor performance if the keyspace grows too large and exceeds the space quota. Periodically maintain and defragment `etcd` to free up space in the data store. See details [here](https://etcd.io/docs/v3.5/op-guide/maintenance/).
+
+Monitor the `etcd` metrics in Prometheus and defragment when required; otherwise, `etcd` can raise a cluster-wide alarm that puts the cluster into maintenance mode, in which it accepts only key reads and deletes.
+
+To keep track of defragmentation requirements, monitor these key metrics:
+
+- `etcd_server_quota_backend_bytes`: the current quota limit
+- `etcd_mvcc_db_total_size_in_use_in_bytes`: the actual database usage after a history compaction
+- `etcd_mvcc_db_total_size_in_bytes`: the database size, including free space waiting for defragmentation
+
+You can also determine whether defragmentation is needed by checking the `etcd` database size in MB that will be freed by defragmentation with the PromQL expression:
+
+- `(etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes)/1024/1024`
+
+Defragmentation is an expensive operation, so it should be executed as infrequently as possible. On the other hand, it's also necessary to make sure any `etcd` member will not exceed the storage quota. The Kubernetes project recommends that when you perform defragmentation, you use a tool such as [etcd-defrag](https://github.com/ahrtr/etcd-defrag).
+
+The `defrag.sh` script (in the `scripts` directory) creates and schedules jobs that periodically defragment data on a `kamaji-etcd` instance. The script generates a Kubernetes CronJob manifest and applies it to the specified namespace. The generated manifest targets a three-member cluster (`<etcd_name>-0` to `<etcd_name>-2`) under the `cluster.local` domain. Make sure you set the defragmentation criteria according to your environment needs.
+
+
+## Usage
+To run the script from the `scripts` directory, use the following command:
+
+```bash
+./defrag.sh [-e etcd_name] [-s etcd_service] [-n etcd_namespace] [-j schedule]
+```
+
+## Parameters
+
+- `-e etcd_name`: Name of the etcd StatefulSet (default: `kamaji-etcd`)
+- `-s etcd_service`: Name of the etcd service (default: `kamaji-etcd`)
+- `-n etcd_namespace`: Namespace of the etcd StatefulSet (default: `kamaji-system`)
+- `-j schedule`: Cron schedule for the defrag job (default: `"0 0 * * *"`, which means daily at midnight)
+
+## Example
+
+To run the script with custom parameters:
+
+```bash
+./defrag.sh -e kamaji-etcd -s kamaji-etcd -n kamaji-system -j "14 9 * * 1-5"
+```
+This will create a Kubernetes CronJob manifest with the specified parameters and apply it to the cluster.
+
+## Debug mode
+To run the script in debug mode, which traces every command it executes, set the environment variable `DEBUG` to `1` before invoking it:
+
+```bash
+export DEBUG=1
+```
diff --git a/scripts/defrag.sh b/scripts/defrag.sh
new file mode 100755
index 0000000..ff8352b
--- /dev/null
+++ b/scripts/defrag.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Enable debugging, exit on errors, and ensure the script fails if any command in a pipeline fails
+if [ "${DEBUG}" = 1 ]; then  # checked before 'set -u' below, so an unset DEBUG just compares as empty
+    set -x
+fi
+set -eu -o pipefail
+
+# Default values for the parameters
+ETCD_NAME="kamaji-etcd"  # overridden by -e; prefix of pod names, secret names and the generated manifest
+ETCD_SERVICE="kamaji-etcd"  # overridden by -s; service part of the endpoint DNS names
+ETCD_NAMESPACE="kamaji-system"  # overridden by -n; namespace of the CronJob and of the endpoints
+SCHEDULE="0 0 * * *"  # every day at midnight
+
+# Parse script parameters
+while getopts "e:s:n:j:" opt; do  # every option takes an argument (trailing ':')
+  case ${opt} in
+    e ) ETCD_NAME=$OPTARG ;;
+    s ) ETCD_SERVICE=$OPTARG ;;
+    n ) ETCD_NAMESPACE=$OPTARG ;;
+    j ) SCHEDULE=$OPTARG ;;
+    \? ) echo "Usage: ./defrag.sh [-e etcd_name] [-s etcd_service] [-n etcd_namespace] [-j schedule]"
+         exit 1 ;;  # getopts reports both unknown options and missing arguments as '?'
+  esac
+done
+
+# Function to create the CronJob manifest for defrag etcd
+create_defrag_cronjob() {
+  local etcd_name=$1
+  local etcd_service=$2
+  local etcd_namespace=$3
+  local schedule=$4  # Add a parameter for the cron schedule
+
+  cat <<EOF > ${etcd_name}-defrag-job.yaml
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: ${etcd_name}-defrag-job
+  namespace: $etcd_namespace
+spec:
+  schedule: "$schedule"  # Use the provided schedule
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+          - name: etcd-defrag
+            image: ghcr.io/ahrtr/etcd-defrag:v0.15.0 # Please replace the version with the latest version.
+            args:
+            - --endpoints=https://${etcd_name}-0.${etcd_service}.${etcd_namespace}.svc.cluster.local:2379,https://${etcd_name}-1.${etcd_service}.${etcd_namespace}.svc.cluster.local:2379,https://${etcd_name}-2.${etcd_service}.${etcd_namespace}.svc.cluster.local:2379
+            - --cacert=/opt/certs/ca/ca.crt
+            - --cert=/opt/certs/root-client-certs/tls.crt
+            - --key=/opt/certs/root-client-certs/tls.key
+            - --cluster
+            - --defrag-rule
+            - "dbQuotaUsage > 0.8 || dbSize - dbSizeInUse > 200*1024*1024"
+            volumeMounts:
+            - mountPath: /opt/certs/root-client-certs
+              name: root-client-certs
+            - mountPath: /opt/certs/ca
+              name: certs
+          restartPolicy: OnFailure
+          securityContext:
+            runAsUser: 0
+          volumes:
+          - name: root-client-certs
+            secret:
+              secretName: ${etcd_name}-root-client-certs
+          - name: certs
+            secret:
+              secretName: ${etcd_name}-certs
+EOF
+}
+
+# Entry point: generate the defrag CronJob manifest and apply it with kubectl
+main() {
+  # Write ./<ETCD_NAME>-defrag-job.yaml from the parsed parameters, then apply that file
+  create_defrag_cronjob "$ETCD_NAME" "$ETCD_SERVICE" "$ETCD_NAMESPACE" "$SCHEDULE"
+  kubectl apply -f "${ETCD_NAME}-defrag-job.yaml"
+}
+
+# Execute the main script
+main
\ No newline at end of file