diff --git a/scripts/install-cni.sh b/scripts/install-cni.sh index 5d6adab6a..37f240aa5 100755 --- a/scripts/install-cni.sh +++ b/scripts/install-cni.sh @@ -21,6 +21,7 @@ calico_ready() { compgen -G "/host/etc/cni/net.d/*calico*.conflist" } +# shellcheck disable=SC2317 # when called with $1=cni_ready cni_ready() { local -r cni_bin="$1" echo "Running '/host/home/kubernetes/bin/${cni_bin}' with CNI_COMMAND=VERSION" @@ -34,11 +35,11 @@ cni_ready() { } # inotify callback -if [ -n "$1" ]; then +if [[ -n "$1" ]]; then # We run into this branch at callback from inotify. In this case, call the # specified function then exit. The return value from that function (exit # status of the last command in the function) is used as the exit status. - # "$@" would be like "calico_ready" or "calico_ready" "cilium-cni". + # "$@" would be like "calico_ready" or "cni_ready" "cilium-cni". "$@" exit fi @@ -230,18 +231,6 @@ if [ "${ENABLE_CILIUM_PLUGIN}" == "true" ]; then # inotify calls back to the beginning of this script. inotify /host/home/kubernetes/bin cilium-cni "$0" cni_ready cilium-cni echo "Cilium plug-in binary is now confirmed as ready." - - HEALTHZ_PORT="${CILIUM_HEALTHZ_PORT:-9879}" - RETRY_MAX_TIME="${CILIUM_HEALTH_MAX_WAIT_TIME:-600}" - # Wait upto the specified time for the cilium pod to report healthy. - if curl -fsSm 1 --retry "${RETRY_MAX_TIME}" --retry-all-errors \ - --retry-max-time "${RETRY_MAX_TIME}" --retry-delay 1 \ - -o /dev/null --stderr - \ - http://localhost:"${HEALTHZ_PORT}"/healthz; then - echo "Cilium healthz reported success." - else - echo "Cilium not yet ready. Continuing anyway." - fi fi # Wait for istio plug-in if it is enabled @@ -273,5 +262,74 @@ function write_file { # Output CNI spec (template). output_file=${CALICO_CNI_SPEC_TEMPLATE_FILE:-/host/etc/cni/net.d/${CNI_SPEC_NAME}} -echo "Creating CNI spec at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")" -write_file "${output_file}" "${cni_spec}" + +# Wait up to the specified time for the cilium pod to report healthy. +cilium_health_check() { + local retry_max_time=$1 + local healthz_port=${2:-${CILIUM_HEALTHZ_PORT:-9879}} + + curl -fsSm 1 --retry "${retry_max_time}" --retry-all-errors \ + --retry-max-time "${retry_max_time}" --retry-delay 1 \ + -o /dev/null --stderr - \ + http://localhost:"${healthz_port}"/healthz +} + +# Try to decouple RUN_CNI_WATCHDOG and ENABLE_CILIUM_PLUGIN; don't assume +# ENABLE_CILIUM_PLUGIN is set whenever RUN_CNI_WATCHDOG is set. +if [[ "${RUN_CNI_WATCHDOG:-}" != "true" ]]; then + + # In non-watchdog mode, we must exit after writing CNI config. + echo "Not running CNI watchdog. Will exit as soon as CNI config is written." + + if [[ "${ENABLE_CILIUM_PLUGIN:-}" == "true" ]]; then + if cilium_health_check "${CILIUM_HEALTH_MAX_WAIT_TIME:-600}"; then + echo "Cilium healthz reported success." + else + echo "Cilium not yet ready. Continuing anyway." + fi + fi + + echo "Creating CNI spec at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")" + write_file "${output_file}" "${cni_spec}" + + exit 0 +fi + +# In watchdog mode, we should write CNI config but never exit. +if [[ "${ENABLE_CILIUM_PLUGIN:-}" != "true" ]]; then + echo "Running CNI watchdog, but there is no Cilium to watch." + + echo "Creating CNI spec at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")" + write_file "${output_file}" "${cni_spec}" + + while true; do + echo "Sleeping infinity now." + sleep infinity + done + # In case of anything unexpected, don't fallthrough to the logic below. + exit 1 +fi + +echo "Running CNI watchdog to watch Cilium and manage CNI config at '${output_file}' with content: $(jq -c . <<<"${cni_spec}")" +cilium_watchdog_success_wait=${CILIUM_WATCHDOG_SUCCESS_WAIT:-300} +cilium_watchdog_failure_retry=${CILIUM_WATCHDOG_FAILURE_RETRY:-60} + +if [[ -n "${CILIUM_FAST_START_NAMESPACES:-}" ]]; then + echo "Cilium has fast-start; writing CNI config upfront then start to check Cilium health." + write_file "${output_file}" "${cni_spec}" +fi + +while true; do + echo "Checking Cilium health allowing retries for up to ${cilium_watchdog_failure_retry}s." + if cilium_health_check "${cilium_watchdog_failure_retry}"; then + echo "Cilium healthz reported success; writing CNI config if not already there then wait for ${cilium_watchdog_success_wait}s." + [[ ! -f "${output_file}" ]] && write_file "${output_file}" "${cni_spec}" + sleep "${cilium_watchdog_success_wait}"s + else + echo "Cilium does not appear healthy; removing CNI config if it exists." + rm -f -- "${output_file}" + fi +done + +# In case of anything unexpected, signal failure. +exit 1 diff --git a/scripts/shell-test.sh b/scripts/shell-test.sh index 148f48249..2f5b1209b 100755 --- a/scripts/shell-test.sh +++ b/scripts/shell-test.sh @@ -73,6 +73,9 @@ echo >/netd-test && rm /netd-test && [[ ! -f /netd-test ]] && pass || fail run_test timeout_cmd timeout 2s sleep 1s && pass || fail +run_test sleep_infinity_cmd +timeout 1s sleep infinity && fail || { [[ "$?" == 124 ]] && pass || fail; } + run_test base64_cmd [[ "$(echo -n AAA | base64 -w 0)" == QUFB ]] && pass || fail diff --git a/scripts/test-install-cni.sh b/scripts/test-install-cni.sh index 87bf7bacb..9683ad9a5 100755 --- a/scripts/test-install-cni.sh +++ b/scripts/test-install-cni.sh @@ -41,6 +41,14 @@ Destination Gateway Genmask Flags Metric Ref Use Iface } export -f timeout + # shellcheck disable=SC2317 + function sleep() { + echo "[MOCK called] sleep $*" + echo "[MOCK] sleep shouldn't be called during normal execution; exiting with ${TEST_EXIT_CODE_SLEEP} as a signal." + exit "${TEST_EXIT_CODE_SLEEP}" + } + export -f sleep + function before_test() { echo "no custom init defined for testcase ${testcase}; define custom mocks in before_test() function as needed" } @@ -55,6 +63,8 @@ function cleanup_envs() { CILIUM_FAST_START_NAMESPACES \ CILIUM_HEALTHZ_PORT \ CILIUM_HEALTH_MAX_WAIT_TIME \ + CILIUM_WATCHDOG_FAILURE_RETRY \ + CILIUM_WATCHDOG_SUCCESS_WAIT \ CNI_SPEC_IPV6_ROUTE \ CNI_SPEC_TEMPLATE \ ENABLE_BANDWIDTH_PLUGIN \ @@ -65,10 +75,12 @@ function cleanup_envs() { ISTIO_CNI_CONFIG \ MIGRATE_TO_DPV2 DPV2_MIGRATION_READY \ RETRY_MAX_TIME \ + RUN_CNI_WATCHDOG \ STACK_TYPE \ WRITE_CALICO_CONFIG_FILE } +export TEST_EXIT_CODE_SLEEP=42 FAIL_COUNT=0 @@ -77,7 +89,7 @@ run_test() { } pass() { - echo " PASS" + echo " PASS [$*]" } fail() { @@ -97,6 +109,9 @@ for testcase in testcase/testcase-*.sh ; do # resetting envs cleanup_envs + # allow being overridden in testcase + TEST_WANT_EXIT_CODE=0 + # setting CNI_SPEC_NAME to testcase name (filename in test.out/) CNI_SPEC_NAME="${testcase%.sh}" export CNI_SPEC_NAME="${CNI_SPEC_NAME##*/}" @@ -110,14 +125,14 @@ for testcase in testcase/testcase-*.sh ; do # running install-cni script ./install-cni.sh >>test.log 2>&1 exit_code="$?" - if [ "0" != "${exit_code}" ] ; then + if [ "${TEST_WANT_EXIT_CODE}" != "${exit_code}" ] ; then # script exited with non-zero code - fail "non-zero exit code ($exit_code)" + fail "unexpected exit code ($exit_code) want (${TEST_WANT_EXIT_CODE})" # running testcase verification elif ! verify ; then fail "verification failure" else - pass + pass "${exit_code}" fi done diff --git a/scripts/testcase/testcase-watchdog-cilium-faststart-unhealthy.sh b/scripts/testcase/testcase-watchdog-cilium-faststart-unhealthy.sh new file mode 100644 index 000000000..0bf89a0e0 --- /dev/null +++ b/scripts/testcase/testcase-watchdog-cilium-faststart-unhealthy.sh @@ -0,0 +1,72 @@ +export KUBERNETES_SERVICE_HOST=kubernetes.default.svc +export KUBERNETES_SERVICE_PORT=443 + +export ENABLE_CALICO_NETWORK_POLICY=false +export ENABLE_BANDWIDTH_PLUGIN=false +export ENABLE_CILIUM_PLUGIN=true +export CILIUM_HEALTHZ_PORT=63197 +export CILIUM_FAST_START_NAMESPACES=default,kube-system +export ENABLE_MASQUERADE=false +export ENABLE_IPV6=false +export RUN_CNI_WATCHDOG=true + +CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json) +export CNI_SPEC_TEMPLATE + +export TEST_WANT_EXIT_CODE=24 + +function before_test() { + + function curl() { + # shellcheck disable=SC2317 + case "$*" in + *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*) + echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}' + ;; + *https://kubernetes.default.svc:443/api/v1/nodes/*) + echo '{ + "metadata": { + "labels": { + }, + "creationTimestamp": "2024-01-03T11:54:01Z", + "name": "gke-my-cluster-default-pool-128bc25d-9c94", + "resourceVersion": "891003", + "uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0" + }, + "spec": { + "podCIDR": "10.52.1.0/24", + "podCIDRs": [ + "10.52.1.0/24" + ], + "providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94" + } + }' + ;; + *http://localhost:63197/*) + # Return unhealthy on the first attempt, then exit on the following. + if [[ "${TEST_CILIUM_HEALTH_CHECKED:-}" == "true" ]]; then + exit "${TEST_WANT_EXIT_CODE}" + fi + TEST_CILIUM_HEALTH_CHECKED=true + return 1 + ;; + *) + #unsupported + exit 1 + esac + } + export -f curl + +} + +function verify() { + local actual + + if [[ -f "/host/etc/cni/net.d/${CNI_SPEC_NAME}" ]]; then + actual=$(jq -S . <"/host/etc/cni/net.d/${CNI_SPEC_NAME}") + echo "Expected CNI config to be missing, but it has:" + echo "$actual" + return 1 + fi + +} diff --git a/scripts/testcase/testcase-watchdog-cilium-faststart.sh b/scripts/testcase/testcase-watchdog-cilium-faststart.sh new file mode 100644 index 000000000..1d7a0c5e3 --- /dev/null +++ b/scripts/testcase/testcase-watchdog-cilium-faststart.sh @@ -0,0 +1,74 @@ +export KUBERNETES_SERVICE_HOST=kubernetes.default.svc +export KUBERNETES_SERVICE_PORT=443 + +export ENABLE_CALICO_NETWORK_POLICY=false +export ENABLE_BANDWIDTH_PLUGIN=false +export ENABLE_CILIUM_PLUGIN=true +export CILIUM_HEALTHZ_PORT=63197 +export CILIUM_FAST_START_NAMESPACES=default,kube-system +export ENABLE_MASQUERADE=false +export ENABLE_IPV6=false +export RUN_CNI_WATCHDOG=true + +CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json) +export CNI_SPEC_TEMPLATE + +export TEST_WANT_EXIT_CODE=24 + +function before_test() { + + function curl() { + # shellcheck disable=SC2317 + case "$*" in + *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*) + echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}' + ;; + *https://kubernetes.default.svc:443/api/v1/nodes/*) + echo '{ + "metadata": { + "labels": { + }, + "creationTimestamp": "2024-01-03T11:54:01Z", + "name": "gke-my-cluster-default-pool-128bc25d-9c94", + "resourceVersion": "891003", + "uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0" + }, + "spec": { + "podCIDR": "10.52.1.0/24", + "podCIDRs": [ + "10.52.1.0/24" + ], + "providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94" + } + }' + ;; + *http://localhost:63197/*) + # With fast-start enabled, CNI config should have been written + # at the first Cilium health check attempt. + exit "${TEST_WANT_EXIT_CODE}" + ;; + *) + #unsupported + exit 1 + esac + } + export -f curl + +} + +function verify() { + local expected + local actual + + expected=$(jq -S . <"testdata/expected-cilium-faststart.json") + actual=$(jq -S . <"/host/etc/cni/net.d/${CNI_SPEC_NAME}") + + if [ "$expected" != "$actual" ] ; then + echo "Expected cni_spec value:" + echo "$expected" + echo "but actual was" + echo "$actual" + return 1 + fi + +} diff --git a/scripts/testcase/testcase-watchdog-cilium-unhealthy.sh b/scripts/testcase/testcase-watchdog-cilium-unhealthy.sh new file mode 100644 index 000000000..bf73d59f2 --- /dev/null +++ b/scripts/testcase/testcase-watchdog-cilium-unhealthy.sh @@ -0,0 +1,72 @@ +export KUBERNETES_SERVICE_HOST=kubernetes.default.svc +export KUBERNETES_SERVICE_PORT=443 + +export ENABLE_CALICO_NETWORK_POLICY=false +export ENABLE_BANDWIDTH_PLUGIN=false +export ENABLE_CILIUM_PLUGIN=true +export CILIUM_HEALTHZ_PORT=63197 +export CILIUM_FAST_START_NAMESPACES= +export ENABLE_MASQUERADE=false +export ENABLE_IPV6=false +export RUN_CNI_WATCHDOG=true + +CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json) +export CNI_SPEC_TEMPLATE + +export TEST_WANT_EXIT_CODE=24 + +function before_test() { + + function curl() { + # shellcheck disable=SC2317 + case "$*" in + *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*) + echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}' + ;; + *https://kubernetes.default.svc:443/api/v1/nodes/*) + echo '{ + "metadata": { + "labels": { + }, + "creationTimestamp": "2024-01-03T11:54:01Z", + "name": "gke-my-cluster-default-pool-128bc25d-9c94", + "resourceVersion": "891003", + "uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0" + }, + "spec": { + "podCIDR": "10.52.1.0/24", + "podCIDRs": [ + "10.52.1.0/24" + ], + "providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94" + } + }' + ;; + *http://localhost:63197/*) + # Return unhealthy on the first attempt, then exit on the following. + if [[ "${TEST_CILIUM_HEALTH_CHECKED:-}" == "true" ]]; then + exit "${TEST_WANT_EXIT_CODE}" + fi + TEST_CILIUM_HEALTH_CHECKED=true + return 1 + ;; + *) + #unsupported + exit 1 + esac + } + export -f curl + +} + +function verify() { + local actual + + if [[ -f "/host/etc/cni/net.d/${CNI_SPEC_NAME}" ]]; then + actual=$(jq -S . <"/host/etc/cni/net.d/${CNI_SPEC_NAME}") + echo "Expected CNI config to be missing, but it has:" + echo "$actual" + return 1 + fi + +} diff --git a/scripts/testcase/testcase-watchdog-cilium.sh b/scripts/testcase/testcase-watchdog-cilium.sh new file mode 100644 index 000000000..810d8776e --- /dev/null +++ b/scripts/testcase/testcase-watchdog-cilium.sh @@ -0,0 +1,73 @@ +export KUBERNETES_SERVICE_HOST=kubernetes.default.svc +export KUBERNETES_SERVICE_PORT=443 + +export ENABLE_CALICO_NETWORK_POLICY=false +export ENABLE_BANDWIDTH_PLUGIN=false +export ENABLE_CILIUM_PLUGIN=true +export CILIUM_HEALTHZ_PORT=63197 +export CILIUM_FAST_START_NAMESPACES= +export ENABLE_MASQUERADE=false +export ENABLE_IPV6=false +export RUN_CNI_WATCHDOG=true + +CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json) +export CNI_SPEC_TEMPLATE + +# shellcheck disable=SC2034 +TEST_WANT_EXIT_CODE=${TEST_EXIT_CODE_SLEEP} + +function before_test() { + + function curl() { + # shellcheck disable=SC2317 + case "$*" in + *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*) + echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}' + ;; + *https://kubernetes.default.svc:443/api/v1/nodes/*) + echo '{ + "metadata": { + "labels": { + }, + "creationTimestamp": "2024-01-03T11:54:01Z", + "name": "gke-my-cluster-default-pool-128bc25d-9c94", + "resourceVersion": "891003", + "uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0" + }, + "spec": { + "podCIDR": "10.52.1.0/24", + "podCIDRs": [ + "10.52.1.0/24" + ], + "providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94" + } + }' + ;; + *http://localhost:63197/*) + echo 'healthz' + ;; + *) + #unsupported + exit 1 + esac + } + export -f curl + +} + +function verify() { + local expected + local actual + + expected=$(jq -S . <"testdata/expected-cilium.json") + actual=$(jq -S . <"/host/etc/cni/net.d/${CNI_SPEC_NAME}") + + if [ "$expected" != "$actual" ] ; then + echo "Expected cni_spec value:" + echo "$expected" + echo "but actual was" + echo "$actual" + return 1 + fi + +} diff --git a/scripts/testcase/testcase-watchdog.sh b/scripts/testcase/testcase-watchdog.sh new file mode 100644 index 000000000..3f7a1ade4 --- /dev/null +++ b/scripts/testcase/testcase-watchdog.sh @@ -0,0 +1,68 @@ +export KUBERNETES_SERVICE_HOST=kubernetes.default.svc +export KUBERNETES_SERVICE_PORT=443 + +export ENABLE_CALICO_NETWORK_POLICY=false +export ENABLE_BANDWIDTH_PLUGIN=false +export ENABLE_CILIUM_PLUGIN=false +export ENABLE_MASQUERADE=false +export ENABLE_IPV6=false +export RUN_CNI_WATCHDOG=true + +CNI_SPEC_TEMPLATE=$(cat testdata/spec-template.json) +export CNI_SPEC_TEMPLATE + +# shellcheck disable=SC2034 +TEST_WANT_EXIT_CODE=${TEST_EXIT_CODE_SLEEP} + +function before_test() { + + function curl() { + # shellcheck disable=SC2317 + case "$*" in + *http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*) + echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}' + ;; + *https://kubernetes.default.svc:443/api/v1/nodes/*) + echo '{ + "metadata": { + "labels": { + }, + "creationTimestamp": "2024-01-03T11:54:01Z", + "name": "gke-my-cluster-default-pool-128bc25d-9c94", + "resourceVersion": "891003", + "uid": "f2353a2f-ca8c-4ca0-8dd3-ad1f964a54f0" + }, + "spec": { + "podCIDR": "10.52.1.0/24", + "podCIDRs": [ + "10.52.1.0/24" + ], + "providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94" + } + }' + ;; + *) + #unsupported + exit 1 + esac + } + export -f curl + +} + +function verify() { + local expected + local actual + + expected=$(jq -S . <"testdata/expected-basic.json") + actual=$(jq -S . <"/host/etc/cni/net.d/${CNI_SPEC_NAME}") + + if [ "$expected" != "$actual" ] ; then + echo "Expected cni_spec value:" + echo "$expected" + echo "but actual was" + echo "$actual" + return 1 + fi + +}