Skip to content

Commit

Permalink
Implement Cilium watchdog
Browse files Browse the repository at this point in the history
  • Loading branch information
jingyuanliang committed Feb 27, 2024
1 parent 99b006f commit a56ea76
Show file tree
Hide file tree
Showing 8 changed files with 455 additions and 20 deletions.
90 changes: 74 additions & 16 deletions scripts/install-cni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ calico_ready() {
compgen -G "/host/etc/cni/net.d/*calico*.conflist"
}

# shellcheck disable=SC2317 # when called with $1=cni_ready
cni_ready() {
local -r cni_bin="$1"
echo "Running '/host/home/kubernetes/bin/${cni_bin}' with CNI_COMMAND=VERSION"
Expand All @@ -34,11 +35,11 @@ cni_ready() {
}

# inotify callback
if [ -n "$1" ]; then
if [[ -n "$1" ]]; then
# We run into this branch at callback from inotify. In this case, call the
# specified function then exit. The return value from that function (exit
# status of the last command in the function) is used as the exit status.
# "$@" would be like "calico_ready" or "calico_ready" "cilium-cni".
# "$@" would be like "calico_ready" or "cni_ready" "cilium-cni".
"$@"
exit
fi
Expand Down Expand Up @@ -230,18 +231,6 @@ if [ "${ENABLE_CILIUM_PLUGIN}" == "true" ]; then
# inotify calls back to the beginning of this script.
inotify /host/home/kubernetes/bin cilium-cni "$0" cni_ready cilium-cni
echo "Cilium plug-in binary is now confirmed as ready."

HEALTHZ_PORT="${CILIUM_HEALTHZ_PORT:-9879}"
RETRY_MAX_TIME="${CILIUM_HEALTH_MAX_WAIT_TIME:-600}"
# Wait upto the specified time for the cilium pod to report healthy.
if curl -fsSm 1 --retry "${RETRY_MAX_TIME}" --retry-all-errors \
--retry-max-time "${RETRY_MAX_TIME}" --retry-delay 1 \
-o /dev/null --stderr - \
http://localhost:"${HEALTHZ_PORT}"/healthz; then
echo "Cilium healthz reported success."
else
echo "Cilium not yet ready. Continuing anyway."
fi
fi

# Wait for istio plug-in if it is enabled
Expand Down Expand Up @@ -273,5 +262,74 @@ function write_file {

# Output CNI spec (template).
output_file=${CALICO_CNI_SPEC_TEMPLATE_FILE:-/host/etc/cni/net.d/${CNI_SPEC_NAME}}
echo "Creating CNI spec at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")"
write_file "${output_file}" "${cni_spec}"

# Probe the local Cilium healthz endpoint, letting curl retry until it answers
# successfully or the retry budget is used up.
#
# Arguments:
#   $1 - retry budget; passed to curl both as the maximum number of retries
#        and as the maximum total retry time in seconds
#   $2 - healthz port (optional); falls back to CILIUM_HEALTHZ_PORT, then 9879
# Outputs:
#   curl diagnostics on stdout (--stderr -); the response body is discarded.
# Returns:
#   curl's exit status.
cilium_health_check() {
  local budget=$1
  local port=${2:-${CILIUM_HEALTHZ_PORT:-9879}}
  local -a probe_args=(
    -fsSm 1
    --retry "${budget}" --retry-all-errors
    --retry-max-time "${budget}" --retry-delay 1
    -o /dev/null --stderr -
    "http://localhost:${port}/healthz"
  )

  curl "${probe_args[@]}"
}

# RUN_CNI_WATCHDOG and ENABLE_CILIUM_PLUGIN are handled as independent
# switches: nothing here assumes that ENABLE_CILIUM_PLUGIN is set just because
# RUN_CNI_WATCHDOG is.
case "${RUN_CNI_WATCHDOG:-}" in
  true)
    # Watchdog mode: carry on with the rest of the script.
    ;;
  *)
    # Non-watchdog mode: write the CNI config once, then exit.
    echo "Not running CNI watchdog. Will exit as soon as CNI config is written."

    # With Cilium enabled, give it a chance to become healthy first. The
    # config is written regardless of the outcome.
    if [[ "${ENABLE_CILIUM_PLUGIN:-}" == "true" ]]; then
      if ! cilium_health_check "${CILIUM_HEALTH_MAX_WAIT_TIME:-600}"; then
        echo "Cilium not yet ready. Continuing anyway."
      else
        echo "Cilium healthz reported success."
      fi
    fi

    echo "Creating CNI spec at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")"
    write_file "${output_file}" "${cni_spec}"

    exit 0
    ;;
esac

# Watchdog mode: the CNI config is written but the script must never exit.
# Without Cilium there is nothing to monitor, so write the config once and
# park forever.
[[ "${ENABLE_CILIUM_PLUGIN:-}" == "true" ]] || {
  echo "Running CNI watchdog, but there is no Cilium to watch."

  echo "Creating CNI spec at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")"
  write_file "${output_file}" "${cni_spec}"

  # Loop so that the script goes back to sleep if `sleep` ever returns.
  while :; do
    echo "Sleeping infinity now."
    sleep infinity
  done
  # Safety net: never fall through into the Cilium watchdog logic below.
  exit 1
}

# Watchdog tunables, in seconds:
#   - how long to wait before re-checking after a healthy probe
#   - how long a single probe may keep retrying before Cilium counts as down
cilium_watchdog_success_wait=${CILIUM_WATCHDOG_SUCCESS_WAIT:-300}
cilium_watchdog_failure_retry=${CILIUM_WATCHDOG_FAILURE_RETRY:-60}
echo "Running CNI watchdog to watch Cilium and manage CNI config at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")"

# With fast-start namespaces configured, the config is published before the
# first health probe instead of after it.
[[ -z "${CILIUM_FAST_START_NAMESPACES:-}" ]] || {
  echo "Cilium has fast-start; writing CNI config upfront then start to check Cilium health."
  write_file "${output_file}" "${cni_spec}"
}

# Keep the CNI config present while Cilium is healthy and absent while it is
# not. An unhealthy probe loops straight back without sleeping.
while :; do
  echo "Checking Cilium health allowing retries for up to ${cilium_watchdog_failure_retry}s."
  if ! cilium_health_check "${cilium_watchdog_failure_retry}"; then
    echo "Cilium does not appear healthy; removing CNI config if it exists."
    rm -f -- "${output_file}"
    continue
  fi

  echo "Cilium healthz reported success; writing CNI config if not already there then wait for ${cilium_watchdog_success_wait}s."
  [[ -f "${output_file}" ]] || write_file "${output_file}" "${cni_spec}"
  sleep "${cilium_watchdog_success_wait}"s
done

# The loop above is not expected to end; if it does, report failure.
exit 1
3 changes: 3 additions & 0 deletions scripts/shell-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ echo >/netd-test && rm /netd-test && [[ ! -f /netd-test ]] && pass || fail
run_test timeout_cmd
timeout 2s sleep 1s && pass || fail

# Checks that `sleep infinity` is accepted and keeps running: the command is
# expected to still be alive when the 1s limit hits. GNU `timeout` reports
# that case with exit status 124, so any other status (including success)
# counts as a failure.
run_test sleep_infinity_cmd
timeout 1s sleep infinity && fail || { [[ "$?" == 124 ]] && pass || fail; }

run_test base64_cmd
[[ "$(echo -n AAA | base64 -w 0)" == QUFB ]] && pass || fail

Expand Down
23 changes: 19 additions & 4 deletions scripts/test-install-cni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ Destination Gateway Genmask Flags Metric Ref Use Iface
}
export -f timeout

# Mock for `sleep`: logs the call and terminates the calling shell with
# TEST_EXIT_CODE_SLEEP, so a test can tell that the script tried to sleep.
# shellcheck disable=SC2317
sleep() {
  printf '%s\n' "[MOCK called] sleep $*"
  printf '%s\n' "[MOCK] sleep shouldn't be called during normal execution; exiting with ${TEST_EXIT_CODE_SLEEP} as a signal."
  exit "${TEST_EXIT_CODE_SLEEP}"
}
export -f sleep

# Default per-testcase hook. It only reports that the current testcase
# (named by the global `testcase`) did not provide its own before_test().
before_test() {
  local -r message="no custom init defined for testcase ${testcase}; define custom mocks in before_test() function as needed"
  printf '%s\n' "${message}"
}
Expand All @@ -55,6 +63,8 @@ function cleanup_envs() {
CILIUM_FAST_START_NAMESPACES \
CILIUM_HEALTHZ_PORT \
CILIUM_HEALTH_MAX_WAIT_TIME \
CILIUM_WATCHDOG_FAILURE_RETRY \
CILIUM_WATCHDOG_SUCCESS_WAIT \
CNI_SPEC_IPV6_ROUTE \
CNI_SPEC_TEMPLATE \
ENABLE_BANDWIDTH_PLUGIN \
Expand All @@ -65,10 +75,12 @@ function cleanup_envs() {
ISTIO_CNI_CONFIG \
MIGRATE_TO_DPV2 DPV2_MIGRATION_READY \
RETRY_MAX_TIME \
RUN_CNI_WATCHDOG \
STACK_TYPE \
WRITE_CALICO_CONFIG_FILE
}

export TEST_EXIT_CODE_SLEEP=42

FAIL_COUNT=0

Expand All @@ -77,7 +89,7 @@ run_test() {
}

pass() {
echo " PASS"
echo " PASS [$*]"
}

fail() {
Expand All @@ -97,6 +109,9 @@ for testcase in testcase/testcase-*.sh ; do
# resetting envs
cleanup_envs

# allow being overridden in testcase
TEST_WANT_EXIT_CODE=0

# setting CNI_SPEC_NAME to testcase name (filename in test.out/)
CNI_SPEC_NAME="${testcase%.sh}"
export CNI_SPEC_NAME="${CNI_SPEC_NAME##*/}"
Expand All @@ -110,14 +125,14 @@ for testcase in testcase/testcase-*.sh ; do
# running install-cni script
./install-cni.sh >>test.log 2>&1
exit_code="$?"
if [ "0" != "${exit_code}" ] ; then
if [ "${TEST_WANT_EXIT_CODE}" != "${exit_code}" ] ; then
# script exited with non-zero code
fail "non-zero exit code ($exit_code)"
fail "unexpected exit code ($exit_code) want (${TEST_WANT_EXIT_CODE})"
# running testcase verification
elif ! verify ; then
fail "verification failure"
else
pass
pass "${exit_code}"
fi

done
Expand Down
72 changes: 72 additions & 0 deletions scripts/testcase/testcase-watchdog-cilium-faststart-unhealthy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Kubernetes API endpoint; the curl mock in before_test() answers requests to
# https://kubernetes.default.svc:443.
export \
  KUBERNETES_SERVICE_HOST=kubernetes.default.svc \
  KUBERNETES_SERVICE_PORT=443

# Scenario under test: Cilium with fast-start namespaces, running in watchdog
# mode, with the healthz port that the curl mock listens for.
export \
  ENABLE_CILIUM_PLUGIN=true \
  CILIUM_HEALTHZ_PORT=63197 \
  CILIUM_FAST_START_NAMESPACES=default,kube-system \
  RUN_CNI_WATCHDOG=true

# Everything else is switched off.
export \
  ENABLE_CALICO_NETWORK_POLICY=false \
  ENABLE_BANDWIDTH_PLUGIN=false \
  ENABLE_MASQUERADE=false \
  ENABLE_IPV6=false

# Assign first and export second, so the status of `cat` is not masked by
# `export`.
CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json)
export CNI_SPEC_TEMPLATE

# Exit status this testcase expects; the curl mock exits with this value on
# the second Cilium health probe.
export TEST_WANT_EXIT_CODE=24

# Installs and exports the curl mock for this testcase, so that the script
# under test sees canned responses instead of making real requests.
before_test() {

  # Mock curl, dispatching on the request arguments:
  #   - metadata server network interface -> canned IPv6 data
  #   - Kubernetes node object            -> canned node JSON
  #   - Cilium healthz                    -> unhealthy once, then exits
  #   - anything else                     -> exits with status 1
  curl() {
    # shellcheck disable=SC2317
    case "$*" in
      *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
        printf '%s\n' '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
        ;;
      *https://kubernetes.default.svc:443/api/v1/nodes/*)
        printf '%s\n' '{
"metadata": {
"labels": {
},
"creationTimestamp": "2024-01-03T11:54:01Z",
"name": "gke-my-cluster-default-pool-128bc25d-9c94",
"resourceVersion": "891003",
"uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0"
},
"spec": {
"podCIDR": "10.52.1.0/24",
"podCIDRs": [
"10.52.1.0/24"
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
        ;;
      *http://localhost:63197/*)
        # The first probe reports unhealthy. Any later probe ends the calling
        # shell with the exit status this testcase expects.
        if [[ "${TEST_CILIUM_HEALTH_CHECKED:-}" != "true" ]]; then
          TEST_CILIUM_HEALTH_CHECKED=true
          return 1
        fi
        exit "${TEST_WANT_EXIT_CODE}"
        ;;
      *)
        # unsupported
        exit 1
        ;;
    esac
  }
  export -f curl

}

# Verification for this testcase: the CNI config must NOT exist.
# Returns 0 when the file is absent. Otherwise prints its key-sorted content
# and returns 1.
verify() {
  local -r cni_config="/host/etc/cni/net.d/${CNI_SPEC_NAME}"
  local actual

  # Nothing was written (or it was removed again): that is the pass case.
  [[ -f "${cni_config}" ]] || return 0

  actual=$(jq -S . <"${cni_config}")
  printf '%s\n' "Expected CNI config to be missing, but it has:"
  echo "$actual"
  return 1
}
74 changes: 74 additions & 0 deletions scripts/testcase/testcase-watchdog-cilium-faststart.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Kubernetes API endpoint; the curl mock in before_test() answers requests to
# https://kubernetes.default.svc:443.
export \
  KUBERNETES_SERVICE_HOST=kubernetes.default.svc \
  KUBERNETES_SERVICE_PORT=443

# Scenario under test: Cilium with fast-start namespaces, running in watchdog
# mode, with the healthz port that the curl mock listens for.
export \
  ENABLE_CILIUM_PLUGIN=true \
  CILIUM_HEALTHZ_PORT=63197 \
  CILIUM_FAST_START_NAMESPACES=default,kube-system \
  RUN_CNI_WATCHDOG=true

# Everything else is switched off.
export \
  ENABLE_CALICO_NETWORK_POLICY=false \
  ENABLE_BANDWIDTH_PLUGIN=false \
  ENABLE_MASQUERADE=false \
  ENABLE_IPV6=false

# Assign first and export second, so the status of `cat` is not masked by
# `export`.
CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json)
export CNI_SPEC_TEMPLATE

# Exit status this testcase expects; the curl mock exits with this value on
# the first Cilium health probe.
export TEST_WANT_EXIT_CODE=24

# Installs and exports the curl mock for this testcase, so that the script
# under test sees canned responses instead of making real requests.
before_test() {

  # Mock curl, dispatching on the request arguments:
  #   - metadata server network interface -> canned IPv6 data
  #   - Kubernetes node object            -> canned node JSON
  #   - Cilium healthz                    -> exits with the expected status
  #   - anything else                     -> exits with status 1
  curl() {
    # shellcheck disable=SC2317
    case "$*" in
      *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
        printf '%s\n' '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
        ;;
      *https://kubernetes.default.svc:443/api/v1/nodes/*)
        printf '%s\n' '{
"metadata": {
"labels": {
},
"creationTimestamp": "2024-01-03T11:54:01Z",
"name": "gke-my-cluster-default-pool-128bc25d-9c94",
"resourceVersion": "891003",
"uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0"
},
"spec": {
"podCIDR": "10.52.1.0/24",
"podCIDRs": [
"10.52.1.0/24"
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
        ;;
      *http://localhost:63197/*)
        # With fast-start the CNI config should already be on disk when the
        # first health probe arrives, so stop the script right here.
        exit "${TEST_WANT_EXIT_CODE}"
        ;;
      *)
        # unsupported
        exit 1
        ;;
    esac
  }
  export -f curl

}

# Verification for this testcase: the CNI config written by the script under
# test must equal the expected fixture once both are normalised with
# `jq -S .` (keys sorted).
#
# Globals:
#   CNI_SPEC_NAME (read) - file name of the CNI config under
#                          /host/etc/cni/net.d/
# Outputs:
#   A diagnostic on stdout when verification fails.
# Returns:
#   0 when both files are readable, parse as JSON and are equal; 1 otherwise.
function verify() {
  local -r expected_file="testdata/expected-cilium-faststart.json"
  local -r actual_file="/host/etc/cni/net.d/${CNI_SPEC_NAME}"
  local expected
  local actual
  local candidate

  # Fail explicitly on missing inputs. Otherwise both command substitutions
  # below would yield empty strings, compare equal, and report a false pass.
  for candidate in "${expected_file}" "${actual_file}"; do
    if [[ ! -r "${candidate}" ]]; then
      echo "Cannot read '${candidate}'"
      return 1
    fi
  done

  if ! expected=$(jq -S . <"${expected_file}"); then
    echo "Failed to parse '${expected_file}' as JSON"
    return 1
  fi
  if ! actual=$(jq -S . <"${actual_file}"); then
    echo "Failed to parse '${actual_file}' as JSON"
    return 1
  fi

  if [[ "${expected}" != "${actual}" ]]; then
    echo "Expected cni_spec value:"
    echo "$expected"
    echo "but actual was"
    echo "$actual"
    return 1
  fi

  return 0
}
Loading

0 comments on commit a56ea76

Please sign in to comment.