From e89864fcd181e2af34bcdb6f9f3d17a9f1eed9df Mon Sep 17 00:00:00 2001 From: bsctl Date: Tue, 20 Aug 2024 13:44:34 +0200 Subject: [PATCH] feat: use defrag community script --- README.md | 6 +- charts/kamaji-etcd/README.md | 2 - .../templates/etcd_cronjob_defrag.yaml | 66 --------------- charts/kamaji-etcd/values.yaml | 5 -- docs/defragmentation.md | 49 +++++++++++ scripts/defrag.sh | 83 +++++++++++++++++++ 6 files changed, 136 insertions(+), 75 deletions(-) delete mode 100644 charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml create mode 100644 docs/defragmentation.md create mode 100755 scripts/defrag.sh diff --git a/README.md b/README.md index 7eb61f9..bec9582 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,11 @@ A multi-tenant deployment for `etcd` is not common practice. However, `etcd` pro ## Documentation Refer to the [etcd documentation](https://etcd.io/docs/v3.5/op-guide). Following sections provide additional procedures to help with a specific setup as it is used into project [Kamaji](https://github.com/clastix/kamaji). -- [Backup and restore from snapshot](docs/snapshot-recovery.md) -- [Disaster Recovery with Velero](docs/velero.md) +- [Taking Snapshots](docs/snapshot.md) +- [Recover from Snapshot](docs/snapshot-recovery.md) +- [Velero](docs/velero.md) - [Rotate Certificates](docs/rotate-certificates.md) +- [Defragmenting Data](docs/defragmentation.md) - [Performance and Optimization](docs/performance-and-optimization.md) ## Roadmap diff --git a/charts/kamaji-etcd/README.md b/charts/kamaji-etcd/README.md index a594bda..3792dd2 100644 --- a/charts/kamaji-etcd/README.md +++ b/charts/kamaji-etcd/README.md @@ -67,8 +67,6 @@ Here the values you can override: | clusterDomain | string | `"cluster.local"` | Domain of the Kubernetes cluster. 
| | datastore.enabled | bool | `false` | Create a datastore custom resource for Kamaji | | datastore.name | string | `""` | Name of Kamaji datastore, set to fully qualified etcd name when null or not provided | -| defragmentation | object | `{"schedule":"0 0 * * *"}` | Enable storage defragmentation | -| defragmentation.schedule | string | `"0 0 * * *"` | The job scheduled maintenance time for defrag (empty to disable) | | extraArgs | list | `[]` | A list of extra arguments to add to the etcd default ones | | fullnameOverride | string | `""` | | | image.pullPolicy | string | `"IfNotPresent"` | Pull policy to use | diff --git a/charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml b/charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml deleted file mode 100644 index f962cb2..0000000 --- a/charts/kamaji-etcd/templates/etcd_cronjob_defrag.yaml +++ /dev/null @@ -1,66 +0,0 @@ -{{- if .Values.defragmentation.schedule -}} -apiVersion: batch/v1 -kind: CronJob -metadata: - labels: - {{- include "etcd.labels" . | nindent 4 }} - name: "{{ .Release.Name }}-defrag" - namespace: {{ .Release.Namespace }} -spec: - schedule: "{{ .Values.defragmentation.schedule }}" - successfulJobsHistoryLimit: 4 - jobTemplate: - spec: - template: - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 12 }} - {{- end }} - serviceAccountName: {{ include "etcd.serviceAccountName" . }} - restartPolicy: OnFailure - containers: - - name: etcd-client - image: {{ include "etcd.fullyQualifiedDockerImage" . }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: - - bash - - -c - - |- - for ENDPOINT in {{ include "etcd.endpoints" . 
}}; do - etcdctl --endpoints=https://${ENDPOINT} defrag; - etcdctl --endpoints=https://${ENDPOINT} alarm disarm; - etcdctl --endpoints=https://${ENDPOINT} alarm list; - etcdctl --endpoints=https://${ENDPOINT} endpoint status -w table; - etcdctl --endpoints=https://${ENDPOINT} member list -w table; - sleep 15; - done; - env: - - name: ETCDCTL_CACERT - value: /opt/certs/ca/ca.crt - - name: ETCDCTL_CERT - value: /opt/certs/root-client-certs/tls.crt - - name: ETCDCTL_KEY - value: /opt/certs/root-client-certs/tls.key - volumeMounts: - - name: root-client-certs - mountPath: /opt/certs/root-client-certs - - name: certs - mountPath: /opt/certs/ca - securityContext: - runAsUser: 1000 - runAsGroup: 1000 - fsGroup: 1000 - {{- with .Values.tolerations }} - tolerations: {{- toYaml . | nindent 12 }} - {{- end }} - volumes: - - name: root-client-certs - secret: - secretName: {{ include "etcd.clientSecretName" . }} - optional: true - - name: certs - secret: - secretName: {{ include "etcd.caSecretName" . }} - optional: true -{{- end }} diff --git a/charts/kamaji-etcd/values.yaml b/charts/kamaji-etcd/values.yaml index 39f7991..17431e8 100644 --- a/charts/kamaji-etcd/values.yaml +++ b/charts/kamaji-etcd/values.yaml @@ -76,11 +76,6 @@ persistentVolumeClaim: customAnnotations: {} # volumeType: local -# -- Enable storage defragmentation -defragmentation: - # -- The job scheduled maintenance time for defrag (empty to disable) - schedule: "0 0 * * *" # Default cron schedule (daily at midnight), see https://crontab.guru/ - # -- Labels to add to all etcd pods podLabels: application: kamaji-etcd diff --git a/docs/defragmentation.md b/docs/defragmentation.md new file mode 100644 index 0000000..7193e44 --- /dev/null +++ b/docs/defragmentation.md @@ -0,0 +1,49 @@ +# Defragmenting Data +For dense Kubernetes clusters, `etcd` can suffer from poor performance if the keyspace grows too large and exceeds the space quota. Periodically maintain and defragment `etcd` to free up space in the data store. 
See details [here](https://etcd.io/docs/v3.5/op-guide/maintenance/). + +Monitor the `etcd` metrics in Prometheus and defragment when required; otherwise, `etcd` can raise a cluster-wide alarm that puts the cluster into a maintenance mode accepting only key reads and deletes. + +To keep track of defragmentation requirements, monitor these key metrics: + +- `etcd_server_quota_backend_bytes`: the current quota limit +- `etcd_mvcc_db_total_size_in_use_in_bytes`: the actual database usage after a history compaction +- `etcd_mvcc_db_total_size_in_bytes`: the database size, including free space waiting for defragmentation + +You can also determine whether defragmentation is needed by checking how much space, in MB, defragmentation would free, using the PromQL expression: + +- `(etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes)/1024/1024` + +Defragmentation is an expensive operation, so it should be executed as infrequently as possible. On the other hand, it is also necessary to make sure that no `etcd` member exceeds the storage quota. The Kubernetes project recommends that when you perform defragmentation, you use a tool such as [etcd-defrag](https://github.com/ahrtr/etcd-defrag). + +The `defrag.sh` script creates and schedules jobs that periodically defragment data on a `kamaji-etcd` instance. The script generates a Kubernetes CronJob manifest and applies it to the specified namespace. Make sure you set the defragmentation criteria according to the needs of your environment. 
+
+
+## Usage
+To run the script, use the following command:
+
+```bash
+./defrag.sh [-e etcd_name] [-s etcd_service] [-n etcd_namespace] [-j schedule]
+```
+
+## Parameters
+
+- `-e etcd_name`: Name of the etcd StatefulSet (default: `kamaji-etcd`)
+- `-s etcd_service`: Name of the etcd service (default: `kamaji-etcd`)
+- `-n etcd_namespace`: Namespace of the etcd StatefulSet (default: `kamaji-system`)
+- `-j schedule`: Cron schedule for the defrag job (default: `"0 0 * * *"`, which means daily at midnight)
+
+## Example
+
+To run the script with custom parameters:
+
+```bash
+./defrag.sh -e kamaji-etcd -s kamaji-etcd -n kamaji-system -j "14 9 * * 1-5"
+```
+This will create a Kubernetes CronJob manifest with the specified parameters and apply it to the cluster.
+
+## Debug mode
+To run the script in debug mode set the environment variable `DEBUG`:
+
+``` bash
+export DEBUG=1
+```
diff --git a/scripts/defrag.sh b/scripts/defrag.sh
new file mode 100755
index 0000000..ff8352b
--- /dev/null
+++ b/scripts/defrag.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Trace commands when DEBUG=1; abort on errors, unset variables, and failed pipeline stages
+if [ "${DEBUG:-}" = 1 ]; then
+    set -x
+fi
+set -eu -o pipefail
+
+# Default values for the parameters (override with -e, -s, -n, -j)
+ETCD_NAME="kamaji-etcd"
+ETCD_SERVICE="kamaji-etcd"
+ETCD_NAMESPACE="kamaji-system"
+SCHEDULE="0 0 * * *" # every day at midnight
+
+# Parse script parameters; getopts reports unknown options and missing arguments as '?'
+while getopts "e:s:n:j:" opt; do
+  case ${opt} in
+    e ) ETCD_NAME=$OPTARG ;;
+    s ) ETCD_SERVICE=$OPTARG ;;
+    n ) ETCD_NAMESPACE=$OPTARG ;;
+    j ) SCHEDULE=$OPTARG ;;
+    \? ) echo "Usage: ./defrag.sh [-e etcd_name] [-s etcd_service] [-n etcd_namespace] [-j schedule]"
+         exit 1 ;;
+  esac
+done
+
+# create_defrag_cronjob NAME SERVICE NAMESPACE SCHEDULE: write the CronJob manifest to ./NAME-defrag-job.yaml
+create_defrag_cronjob() {
+  local etcd_name=$1
+  local etcd_service=$2
+  local etcd_namespace=$3
+  local schedule=$4  # cron expression used for the CronJob schedule
+
+  cat <<EOF > "${etcd_name}-defrag-job.yaml"
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: ${etcd_name}-defrag-job
+  namespace: $etcd_namespace
+spec:
+  schedule: "$schedule"  # Use the provided schedule
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+          - name: etcd-defrag
+            image: ghcr.io/ahrtr/etcd-defrag:v0.15.0 # Please replace the version with the latest version.
+            args:
+            - --endpoints=https://${etcd_name}-0.${etcd_service}.${etcd_namespace}.svc.cluster.local:2379,https://${etcd_name}-1.${etcd_service}.${etcd_namespace}.svc.cluster.local:2379,https://${etcd_name}-2.${etcd_service}.${etcd_namespace}.svc.cluster.local:2379
+            - --cacert=/opt/certs/ca/ca.crt
+            - --cert=/opt/certs/root-client-certs/tls.crt
+            - --key=/opt/certs/root-client-certs/tls.key
+            - --cluster
+            - --defrag-rule
+            - "dbQuotaUsage > 0.8 || dbSize - dbSizeInUse > 200*1024*1024"
+            volumeMounts:
+            - mountPath: /opt/certs/root-client-certs
+              name: root-client-certs
+            - mountPath: /opt/certs/ca
+              name: certs
+          restartPolicy: OnFailure
+          securityContext:
+            runAsUser: 0
+          volumes:
+          - name: root-client-certs
+            secret:
+              secretName: ${etcd_name}-root-client-certs
+          - name: certs
+            secret:
+              secretName: ${etcd_name}-certs
+EOF
+}
+
+# Main script to defrag etcd
+main() {
+  # Generate the manifest in the current directory, then apply it to the cluster
+  create_defrag_cronjob "$ETCD_NAME" "$ETCD_SERVICE" "$ETCD_NAMESPACE" "$SCHEDULE"
+  kubectl apply -f "${ETCD_NAME}-defrag-job.yaml"
+}
+
+# Execute the main script
+main
\ No newline at end of file