K8SPS-288 - async self healing test #428

Draft
wants to merge 13 commits into main
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -275,6 +275,7 @@ void prepareNode() {
# v0.15.0 kuttl version
kubectl krew install --manifest-url https://raw.githubusercontent.com/kubernetes-sigs/krew-index/a67f31ecb2e62f15149ca66d096357050f07b77d/plugins/kuttl.yaml
printf "%s is installed" "$(kubectl kuttl --version)"
kubectl krew install assert
'''
}

@@ -287,7 +288,6 @@ pipeline {
environment {
CLOUDSDK_CORE_DISABLE_PROMPTS = 1
CLEAN_NAMESPACE = 1
OPERATOR_NS = 'ps-operator'
GIT_SHORT_COMMIT = sh(script: 'git rev-parse --short HEAD', returnStdout: true).trim()
VERSION = "${env.GIT_BRANCH}-${env.GIT_SHORT_COMMIT}"
CLUSTER_NAME = sh(script: "echo jen-ps-${env.CHANGE_ID}-${GIT_SHORT_COMMIT}-${env.BUILD_NUMBER} | tr '[:upper:]' '[:lower:]'", returnStdout: true).trim()
15 changes: 15 additions & 0 deletions e2e-tests/conf/chaos-network-loss.yml
@@ -0,0 +1,15 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
name: network-loss-example
spec:
action: loss
mode: one
selector:
pods:
test-namespace:
- pod-name
loss:
loss: "100"
correlation: "100"
duration: "60s"
13 changes: 13 additions & 0 deletions e2e-tests/conf/chaos-pod-failure.yml
@@ -0,0 +1,13 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
name: pod-failure-example
spec:
action: pod-failure
mode: one
value: ""
duration: "60s"
selector:
pods:
test-namespace:
- pod-name
11 changes: 11 additions & 0 deletions e2e-tests/conf/chaos-pod-kill.yml
@@ -0,0 +1,11 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
name: pod-kill-example
spec:
action: pod-kill
mode: one
selector:
pods:
test-namespace:
- pod-name
144 changes: 123 additions & 21 deletions e2e-tests/functions
@@ -50,7 +50,7 @@ deploy_pmm_server() {
--set platform="${platform}" \
"https://percona-charts.storage.googleapis.com/pmm-server-${PMM_SERVER_VERSION}.tgz"
fi
SERVICE="postgres"
local SERVICE="postgres"
until kubectl -n "${NAMESPACE}" exec monitoring-0 -- bash -c "pgrep -x $SERVICE >/dev/null"; do
echo "Retry $retry"
sleep 5
@@ -63,13 +63,13 @@ deploy_pmm_server() {
}

get_pmm_api_key() {
ADMIN_PASSWORD=$(kubectl -n "${NAMESPACE}" exec monitoring-0 -- bash -c "printenv | grep ADMIN_PASSWORD | cut -d '=' -f2")
local ADMIN_PASSWORD=$(kubectl -n "${NAMESPACE}" exec monitoring-0 -- bash -c "printenv | grep ADMIN_PASSWORD | cut -d '=' -f2")
echo $(curl --insecure -X POST -H "Content-Type: application/json" -d '{"name":"operator", "role": "Admin"}' "https://admin:$ADMIN_PASSWORD@"$(get_service_ip monitoring-service)"/graph/api/auth/keys" | jq .key)
}

deploy_minio() {
accessKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d)"
secretKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d)"
local accessKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d)"
local secretKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d)"

helm uninstall -n "${NAMESPACE}" minio-service || :
helm repo remove minio || :
@@ -299,6 +299,7 @@ get_mysql_users() {

get_service_ip() {
local service=$1

while (kubectl get service/$service -n "${NAMESPACE}" -o 'jsonpath={.spec.type}' 2>&1 || :) | grep -q NotFound; do
sleep 1
done
@@ -379,16 +380,43 @@ wait_pod() {
set -o xtrace
}

wait_deployment() {
local name=$1
local target_namespace=${2:-"$namespace"}

sleep 10
set +o xtrace
retry=0
echo -n $name
until [ -n "$(kubectl -n ${target_namespace} get deployment $name -o jsonpath='{.status.replicas}')" \
-a "$(kubectl -n ${target_namespace} get deployment $name -o jsonpath='{.status.replicas}')" \
== "$(kubectl -n ${target_namespace} get deployment $name -o jsonpath='{.status.readyReplicas}')" ]; do
sleep 1
echo -n .
let retry+=1
if [ $retry -ge 360 ]; then
kubectl logs $(get_operator_pod) -c operator \
| grep -v 'level=info' \
| grep -v 'level=debug' \
| tail -100
echo max retry count $retry reached. something went wrong with operator or kubernetes cluster
exit 1
fi
done
echo
set -o xtrace
}

check_auto_tuning() {
RAM_SIZE=$1
RDS_MEM_INSTANCE=12582880
CUSTOM_INNODB_SIZE=$2
CUSTOM_CONNECTIONS=$3
local RAM_SIZE=$1
local RDS_MEM_INSTANCE=12582880
local CUSTOM_INNODB_SIZE=$2
local CUSTOM_CONNECTIONS=$3

INNODB_SIZE=$(run_mysql \
local INNODB_SIZE=$(run_mysql \
'SELECT @@innodb_buffer_pool_size;' \
"-h $(get_haproxy_svc "$(get_cluster_name)") -uroot -proot_password")
CONNECTIONS=$(run_mysql \
local CONNECTIONS=$(run_mysql \
'SELECT @@max_connections;' \
"-h $(get_haproxy_svc "$(get_cluster_name)") -uroot -proot_password")

@@ -448,12 +476,15 @@ get_primary_from_haproxy() {
run_mysql "SHOW VARIABLES LIKE '%hostname%';" "-h ${haproxy_pod_ip} -P3306 -uroot -proot_password" | awk '{print $2}'
}

get_primary_from_group_replication() {
run_mysql "SELECT MEMBER_HOST FROM performance_schema.replication_group_members where MEMBER_ROLE='PRIMARY';" "-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password" | cut -d'.' -f1
}

verify_certificate_sans() {
local certificate=$1
local expected_sans=$2

have=$(mktemp)
want=$(mktemp)
local have=$(mktemp)
local want=$(mktemp)

kubectl -n "${NAMESPACE}" get certificate "${certificate}" -o jsonpath='{.spec.dnsNames}' | jq '.' >"${have}"
echo "${expected_sans}" | jq '.' >"${want}"
@@ -462,21 +493,19 @@ verify_certificate_sans() {
}

check_passwords_leak() {

secrets=$(kubectl get secrets -o json | jq -r '.items[].data | to_entries | .[] | select(.key | (endswith(".crt") or endswith(".key") or endswith(".pub") or endswith(".pem") or endswith(".p12")) | not) | .value')

passwords="$(for i in $secrets; do base64 -d <<< $i; echo; done) $secrets"
pods=$(kubectl -n "${NAMESPACE}" get pods -o name | awk -F "/" '{print $2}')
local secrets=$(kubectl get secrets -o json | jq -r '.items[].data | to_entries | .[] | select(.key | (endswith(".crt") or endswith(".key") or endswith(".pub") or endswith(".pem") or endswith(".p12")) | not) | .value')
local passwords="$(for i in $secrets; do base64 -d <<< $i; echo; done) $secrets"
local pods=$(kubectl -n "${NAMESPACE}" get pods -o name | awk -F "/" '{print $2}')

collect_logs() {
NS=$1
for p in $pods; do
containers=$(kubectl -n "$NS" get pod $p -o jsonpath='{.spec.containers[*].name}')
local containers=$(kubectl -n "$NS" get pod $p -o jsonpath='{.spec.containers[*].name}')
for c in $containers; do
kubectl -n "$NS" logs $p -c $c >${TEMP_DIR}/logs_output-$p-$c.txt
echo logs saved in: ${TEMP_DIR}/logs_output-$p-$c.txt
for pass in $passwords; do
count=$(grep -c --fixed-strings -- "$pass" ${TEMP_DIR}/logs_output-$p-$c.txt || :)
local count=$(grep -c --fixed-strings -- "$pass" ${TEMP_DIR}/logs_output-$p-$c.txt || :)
if [[ $count != 0 ]]; then
echo leaked passwords are found in log ${TEMP_DIR}/logs_output-$p-$c.txt
false
@@ -489,7 +518,80 @@ check_passwords_leak() {

collect_logs $NAMESPACE
if [ -n "$OPERATOR_NS" ]; then
pods=$(kubectl -n "${OPERATOR_NS}" get pods -o name | awk -F "/" '{print $2}')
local pods=$(kubectl -n "${OPERATOR_NS}" get pods -o name | awk -F "/" '{print $2}')
collect_logs $OPERATOR_NS
fi
}

deploy_chaos_mesh() {
destroy_chaos_mesh

helm repo add chaos-mesh https://charts.chaos-mesh.org
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock --set dashboard.create=false --version 2.5.1
sleep 10
}

destroy_chaos_mesh() {
local chaos_mesh_ns=$(helm list --all-namespaces --filter chaos-mesh | tail -n1 | awk -F' ' '{print $2}' | sed 's/NAMESPACE//')

for i in $(kubectl api-resources | grep chaos-mesh | awk '{print $1}'); do timeout 30 kubectl delete ${i} --all --all-namespaces || :; done
if [ -n "${chaos_mesh_ns}" ]; then
helm uninstall chaos-mesh --namespace ${chaos_mesh_ns} || :
fi
timeout 30 kubectl delete crd $(kubectl get crd | grep 'chaos-mesh.org' | awk '{print $1}') || :
timeout 30 kubectl delete clusterrolebinding $(kubectl get clusterrolebinding | grep 'chaos-mesh' | awk '{print $1}') || :
timeout 30 kubectl delete clusterrole $(kubectl get clusterrole | grep 'chaos-mesh' | awk '{print $1}') || :
timeout 30 kubectl delete MutatingWebhookConfiguration $(kubectl get MutatingWebhookConfiguration | grep 'chaos-mesh' | awk '{print $1}') || :
timeout 30 kubectl delete ValidatingWebhookConfiguration $(kubectl get ValidatingWebhookConfiguration | grep 'chaos-mesh' | awk '{print $1}') || :
timeout 30 kubectl delete ValidatingWebhookConfiguration $(kubectl get ValidatingWebhookConfiguration | grep 'validate-auth' | awk '{print $1}') || :
}

kill_pods() {
local ns=$1
local selector=$2
local pod_label=$3
local label_value=$4
local chaos_suffix=$5

if [ "${selector}" == "pod" ]; then
yq eval '
.metadata.name = "chaos-pod-kill-'${chaos_suffix}'" |
del(.spec.selector.pods.test-namespace) |
.spec.selector.pods.'${ns}'[0] = "'${pod_label}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
| kubectl apply --namespace ${ns} -f -
elif [ "${selector}" == "label" ]; then
yq eval '
.metadata.name = "chaos-kill-label-'${chaos_suffix}'" |
.spec.mode = "all" |
del(.spec.selector.pods) |
.spec.selector.labelSelectors."'${pod_label}'" = "'${label_value}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
| kubectl apply --namespace ${ns} -f -
fi
sleep 5
}

failure_pod() {
local ns=$1
local pod=$2
local chaos_suffix=$3

yq eval '
.metadata.name = "chaos-pod-failure-'${chaos_suffix}'" |
del(.spec.selector.pods.test-namespace) |
.spec.selector.pods.'${ns}'[0] = "'${pod}'"' ${TESTS_CONFIG_DIR}/chaos-pod-failure.yml \
| kubectl apply --namespace ${ns} -f -
sleep 5
}

network_loss() {
local ns=$1
local pod=$2
local chaos_suffix=$3

yq eval '
.metadata.name = "chaos-pod-network-loss-'${chaos_suffix}'" |
del(.spec.selector.pods.test-namespace) |
.spec.selector.pods.'${ns}'[0] = "'${pod}'"' ${TESTS_CONFIG_DIR}/chaos-network-loss.yml \
| kubectl apply --namespace ${ns} -f -
sleep 5
}
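
For context, here is a minimal usage sketch of the new chaos helpers. It is illustrative only and not part of this PR; it assumes a kuttl test step that has sourced e2e-tests/functions, and the pod name and label key used below are hypothetical placeholders.

    source ../../functions

    # stand up Chaos Mesh in the test namespace
    deploy_chaos_mesh

    # kill a single pod by name ("pod" selector mode); pod name is an assumption
    kill_pods "${NAMESPACE}" "pod" "$(get_cluster_name)-mysql-0" "" "primary"

    # kill every pod matching a label ("label" selector mode); label key is an assumption
    kill_pods "${NAMESPACE}" "label" "app.kubernetes.io/instance" "$(get_cluster_name)" "label-kill"

    # inject a 60s pod failure and a 60s full network loss against one pod
    failure_pod "${NAMESPACE}" "$(get_cluster_name)-mysql-0" "primary"
    network_loss "${NAMESPACE}" "$(get_cluster_name)-mysql-0" "primary"

    # remove Chaos Mesh resources once the test is done
    destroy_chaos_mesh
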
3 changes: 3 additions & 0 deletions e2e-tests/run-distro.csv
@@ -7,13 +7,16 @@ gr-haproxy
gr-init-deploy
gr-one-pod
gr-scaling
gr-self-healing
gr-tls-cert-manager
gr-users
haproxy
init-deploy
monitoring
one-pod
operator-self-healing
scaling
self-healing
service-per-pod
sidecars
smart-update
3 changes: 3 additions & 0 deletions e2e-tests/run-minikube.csv
@@ -7,11 +7,14 @@ gr-haproxy
gr-init-deploy
gr-one-pod
gr-scaling
gr-self-healing
gr-tls-cert-manager
gr-users
haproxy
init-deploy
one-pod
operator-self-healing
self-healing
sidecars
smart-update
tls-cert-manager
3 changes: 3 additions & 0 deletions e2e-tests/run-pr.csv
@@ -10,14 +10,17 @@ gr-ignore-annotations
gr-init-deploy
gr-one-pod
gr-scaling
gr-self-healing
gr-tls-cert-manager
gr-users
haproxy
init-deploy
limits
monitoring
one-pod
operator-self-healing
scaling
self-healing
service-per-pod
sidecars
smart-update
3 changes: 3 additions & 0 deletions e2e-tests/run-release.csv
@@ -9,14 +9,17 @@ gr-ignore-annotations
gr-init-deploy
gr-one-pod
gr-scaling
gr-self-healing
gr-tls-cert-manager
gr-users
haproxy
init-deploy
limits
monitoring
one-pod
operator-self-healing
scaling
self-healing
service-per-pod
sidecars
smart-update
26 changes: 26 additions & 0 deletions e2e-tests/tests/gr-self-healing/00-assert.yaml
@@ -0,0 +1,26 @@
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 120
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: perconaservermysqls.ps.percona.com
spec:
group: ps.percona.com
names:
kind: PerconaServerMySQL
listKind: PerconaServerMySQLList
plural: perconaservermysqls
shortNames:
- ps
singular: perconaservermysql
scope: Namespaced
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
metadata:
name: check-operator-deploy-status
timeout: 120
commands:
- script: kubectl assert exist-enhanced deployment percona-server-mysql-operator -n ${OPERATOR_NS:-$NAMESPACE} --field-selector status.readyReplicas=1
14 changes: 14 additions & 0 deletions e2e-tests/tests/gr-self-healing/00-deploy-operator.yaml
@@ -0,0 +1,14 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep
timeout: 10
commands:
- script: |-
set -o errexit
set -o xtrace

source ../../functions

deploy_operator
deploy_non_tls_cluster_secrets
deploy_tls_cluster_secrets
deploy_client