From 9a0262605bf8f68d9f57f56b0ca555544bd6d835 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 1 Nov 2024 08:13:08 +0100 Subject: [PATCH 01/22] Add e2e test concurrency w/ signal This will help make sure the big refactoring does not break the main features. Signed-off-by: Jean-Philippe Evrard --- .github/workflows/on-pr.yaml | 3 +- Makefile | 6 ++-- tests/kind/main_test.go | 58 +++++++++++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/.github/workflows/on-pr.yaml b/.github/workflows/on-pr.yaml index 86973d425..e505be769 100644 --- a/.github/workflows/on-pr.yaml +++ b/.github/workflows/on-pr.yaml @@ -88,6 +88,7 @@ jobs: - "TestE2EWithCommand" - "TestE2EWithSignal" - "TestE2EConcurrentWithCommand" + - "TestE2EConcurrentWithSignal" kubernetes_version: - "previous" - "current" @@ -148,4 +149,4 @@ jobs: install_only: true version: v0.22.0 - name: Run specific e2e tests - run: make e2e-test ARGS="-run ^${{ matrix.testname }}" \ No newline at end of file + run: make e2e-test ARGS="-run ^${{ matrix.testname }}" diff --git a/Makefile b/Makefile index 39cd5d5fe..591cab08b 100644 --- a/Makefile +++ b/Makefile @@ -47,8 +47,10 @@ dev-manifest: sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds.yaml > tests/kind/testfiles/kured-ds.yaml # signal e2e scenario sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-signal.yaml - # concurrency e2e scenario - sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml > tests/kind/testfiles/kured-ds-concurrent.yaml + # concurrency e2e command scenario + sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml > tests/kind/testfiles/kured-ds-concurrent-command.yaml + # concurrency e2e signal scenario + sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-concurrent-signal.yaml e2e-test: dev-manifest dev-image diff --git a/tests/kind/main_test.go b/tests/kind/main_test.go index 0a13c00f9..eea0faf3a 100644 --- a/tests/kind/main_test.go +++ b/tests/kind/main_test.go @@ -257,7 +257,57 @@ func TestE2EConcurrentWithCommand(t *testing.T) { kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent.yaml")) + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-command.yaml")) + defer k.FlushLog() + + err := k.Create() + if err != nil { + t.Fatalf("Error creating cluster %v", err) + } + defer func(k *KindTest) { + err := k.Destroy() + if err != nil { + t.Fatalf("Error destroying cluster %v", err) + } + }(k) + + k.Write([]byte("Now running e2e tests")) + + if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil { + t.Fatalf("failed to create sentinels: %v", err) + } + + if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", 
kindContext); err != nil { + t.Fatalf("failed to follow reboot: %v", err) + } + }) + } +} + +func TestE2EConcurrentWithSignal(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var kindClusterConfigs = []string{ + "previous", + "current", + "next", + } + // Iterate over each Kubernetes version + for _, version := range kindClusterConfigs { + version := version + // Define a subtest for each combination + t.Run(version, func(t *testing.T) { + t.Parallel() // Allow tests to run in parallel + + randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + kindClusterName := fmt.Sprintf("kured-e2e-concurrentsignal-%v-%v", version, randomInt) + kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) + kindContext := fmt.Sprintf("kind-%v", kindClusterName) + + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-signal.yaml")) defer k.FlushLog() err := k.Create() @@ -307,10 +357,10 @@ func TestCordonningIsKept(t *testing.T) { kindContext := fmt.Sprintf("kind-%v", kindClusterName) var manifest string - if version == "concurrency1" { - manifest = fmt.Sprintf("testfiles/kured-ds.yaml") + if variant == "concurrency1" { + manifest = fmt.Sprintf("testfiles/kured-ds-signal.yaml") } else { - manifest = fmt.Sprintf("testfiles/kured-ds-concurrent.yaml") + manifest = fmt.Sprintf("testfiles/kured-ds-concurrent-signal.yaml") } k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(manifest)) defer k.FlushLog() From 27c9c5d36b6043df89870883e1754d890c9078cd Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 1 Nov 2024 08:32:34 +0100 Subject: [PATCH 02/22] Add podblocker test Extends test coverage to ensure nothing breaks Signed-off-by: Jean-Philippe Evrard --- .github/workflows/on-pr.yaml | 5 +++ Makefile | 3 +- tests/kind/main_test.go | 43 ++++++++++++++++++++++++ tests/kind/testfiles/podblocker.sh | 54 ++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100755 tests/kind/testfiles/podblocker.sh diff --git a/.github/workflows/on-pr.yaml b/.github/workflows/on-pr.yaml index e505be769..74ebe8a5f 100644 --- a/.github/workflows/on-pr.yaml +++ b/.github/workflows/on-pr.yaml @@ -127,6 +127,7 @@ jobs: testname: - "TestCordonningIsKept/concurrency1" - "TestCordonningIsKept/concurrency2" + - "TestE2EBlocker/podblocker" steps: - uses: actions/checkout@v4 - name: Ensure go version @@ -148,5 +149,9 @@ jobs: with: install_only: true version: v0.22.0 + # Keep this until v1.31 (or superior) becomes the default kubectl version for the kind-action. + # It is used in podblocker shell script test to use --all-pods. + # If the podblocker e2e test relies on another way, this can also be removed. 
+ kubectl_version: v1.31.0 - name: Run specific e2e tests run: make e2e-test ARGS="-run ^${{ matrix.testname }}" diff --git a/Makefile b/Makefile index 591cab08b..a7dabc9a9 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,8 @@ dev-manifest: sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml > tests/kind/testfiles/kured-ds-concurrent-command.yaml # concurrency e2e signal scenario sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-concurrent-signal.yaml - + # pod blocker e2e signal scenario + sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--blocking-pod-selector=name=temperamental/\1--blocking-pod-selector=app=blocker/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-podblocker.yaml e2e-test: dev-manifest dev-image echo "Running ALL go tests" diff --git a/tests/kind/main_test.go b/tests/kind/main_test.go index eea0faf3a..4f5f510d0 100644 --- a/tests/kind/main_test.go +++ b/tests/kind/main_test.go @@ -384,3 +384,46 @@ func TestCordonningIsKept(t *testing.T) { }) } } +func TestE2EBlocker(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var kindClusterConfigs = []string{ + "podblocker", + } + // Iterate over each Kubernetes version + for _, version := range kindClusterConfigs { + version := version + // Define a subtest for each combination + t.Run(version, func(t *testing.T) { + t.Parallel() // Allow tests to run in parallel + + randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", version, randomInt) + kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-next.yaml") + kindContext := fmt.Sprintf("kind-%v", kindClusterName) + + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", version))) + defer k.FlushLog() + + err := k.Create() + if err != nil { + t.Fatalf("Error creating cluster %v", err) + } + defer func(k *KindTest) { + err := k.Destroy() + if err != nil { + t.Fatalf("Error destroying cluster %v", err) + } + }(k) + + k.Write([]byte("Now running e2e tests")) + + if err := k.RunCmd("bash", fmt.Sprintf("testfiles/%v.sh",version), kindContext); err != nil { + t.Fatalf("node blocker test did not succeed: %v", err) + } + }) + } +} diff --git a/tests/kind/testfiles/podblocker.sh b/tests/kind/testfiles/podblocker.sh new file mode 100755 index 000000000..66bf6d543 --- /dev/null +++ b/tests/kind/testfiles/podblocker.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +kubectl_flags=( ) +[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1") + +function gather_logs_and_cleanup { + for id in $(docker ps -q); do + echo "############################################################" + echo "docker logs for container $id:" + docker logs "$id" + done + ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" logs ds/kured --all-pods -n kube-system +} +trap gather_logs_and_cleanup EXIT + +set +o errexit +worker=$(${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o custom-columns=name:metadata.name --no-headers | grep worker | head -n 1) + +${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" label nodes "$worker" blocked-host=yes + 
+${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" apply -f - << EOF +apiVersion: v1 +kind: Pod +metadata: + name: nginx + labels: + app: blocker +spec: + containers: + - name: nginx + image: nginx + imagePullPolicy: IfNotPresent + nodeSelector: + blocked-host: "yes" +EOF + +docker exec "$worker" touch "${SENTINEL_FILE:-/var/run/reboot-required}" + +set -o errexit +max_attempts="100" +attempt_num=1 +sleep_time=5 + +until ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" logs ds/kured --all-pods -n kube-system | grep -i -e "Reboot.*blocked" +do + if (( attempt_num == max_attempts )); then + echo "Attempt $attempt_num failed and there are no more attempts left!" + exit 1 + else + echo "Did not find 'reboot blocked' in the log, retrying in $sleep_time seconds (Attempt #$attempt_num)" + sleep "$sleep_time" + fi + (( attempt_num++ )) +done From 0a1750dc6dc37f703ac811520ce15055a5b98764 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 1 Nov 2024 08:34:42 +0100 Subject: [PATCH 03/22] Rename "version" with "variant" in tests For tests not running in different kubernetes versions, but have different tests subcases/variants, rephrase the wording "versions" as it is confusing. Signed-off-by: Jean-Philippe Evrard --- tests/kind/main_test.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/kind/main_test.go b/tests/kind/main_test.go index 4f5f510d0..c09c71e16 100644 --- a/tests/kind/main_test.go +++ b/tests/kind/main_test.go @@ -344,15 +344,15 @@ func TestCordonningIsKept(t *testing.T) { "concurrency1", "concurrency2", } - // Iterate over each Kubernetes version - for _, version := range kindClusterConfigs { - version := version + // Iterate over each test variant + for _, variant := range kindClusterConfigs { + variant := variant // Define a subtest for each combination - t.Run(version, func(t *testing.T) { + t.Run(variant, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) - kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", version, randomInt) + kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, randomInt) kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-next.yaml") kindContext := fmt.Sprintf("kind-%v", kindClusterName) @@ -393,19 +393,19 @@ func TestE2EBlocker(t *testing.T) { var kindClusterConfigs = []string{ "podblocker", } - // Iterate over each Kubernetes version - for _, version := range kindClusterConfigs { - version := version + // Iterate over each variant of the test + for _, variant := range kindClusterConfigs { + variant := variant // Define a subtest for each combination - t.Run(version, func(t *testing.T) { + t.Run(variant, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) - kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", version, randomInt) + kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, randomInt) kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-next.yaml") kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", version))) + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", variant))) defer k.FlushLog() err := 
k.Create() @@ -421,7 +421,7 @@ func TestE2EBlocker(t *testing.T) { k.Write([]byte("Now running e2e tests")) - if err := k.RunCmd("bash", fmt.Sprintf("testfiles/%v.sh",version), kindContext); err != nil { + if err := k.RunCmd("bash", fmt.Sprintf("testfiles/%v.sh",variant), kindContext); err != nil { t.Fatalf("node blocker test did not succeed: %v", err) } }) From 6170770484057c7b5df646c4e1e1113b135da10f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Thu, 7 Nov 2024 19:34:03 +0100 Subject: [PATCH 04/22] Fix Staticcheck's SA1024 (subset with dupe chars) This will replace trim, taking a cutset, with Replace. This clarifies the intent to remove a substring. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index b3412932a..a6b1c0e82 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -320,11 +320,11 @@ func validateNotificationURL(notifyURL string, slackHookURL string) string { log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) return "" } - if len(strings.Split(strings.Trim(parsedURL.Path, "/services/"), "/")) != 3 { + if len(strings.Split(strings.Replace(parsedURL.Path, "/services/", "", -1), "/")) != 3 { log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") return "" } - return fmt.Sprintf("slack://%s", strings.Trim(parsedURL.Path, "/services/")) + return fmt.Sprintf("slack://%s", strings.Replace(parsedURL.Path, "/services/", "", -1)) } return "" } From ca73c4a0731cb210a45c49ee6566578b08f8cc7a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Thu, 7 Nov 2024 19:38:38 +0100 Subject: [PATCH 05/22] Fix Staticcheck's ST1005 According to staticcheck, Error strings should not be capitalized (ST1005). This changes the cases for our errors. 
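For illustration only (not part of this patch), a minimal, self-contained example of the convention ST1005 enforces; the lock message is borrowed from the diff below:

    package main

    import (
        "errors"
        "fmt"
    )

    func release(held bool) error {
        if !held {
            // ST1005: error strings start lower-case and carry no trailing
            // punctuation, because callers routinely wrap them.
            return errors.New("lock not held") // "Lock not held" would be flagged
        }
        return nil
    }

    func main() {
        // The wrapped message reads naturally with a lower-case inner error.
        fmt.Println(fmt.Errorf("releasing lock: %w", release(false)))
    }
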
Signed-off-by: Jean-Philippe Evrard --- pkg/daemonsetlock/daemonsetlock.go | 4 ++-- pkg/timewindow/days.go | 4 ++-- pkg/timewindow/timewindow.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/daemonsetlock/daemonsetlock.go b/pkg/daemonsetlock/daemonsetlock.go index c0da3a59c..ba300f9de 100644 --- a/pkg/daemonsetlock/daemonsetlock.go +++ b/pkg/daemonsetlock/daemonsetlock.go @@ -214,10 +214,10 @@ func (dsl *DaemonSetSingleLock) Release() error { } if value.NodeID != dsl.nodeID { - return fmt.Errorf("Not lock holder: %v", value.NodeID) + return fmt.Errorf("not lock holder: %v", value.NodeID) } } else { - return fmt.Errorf("Lock not held") + return fmt.Errorf("lock not held") } delete(ds.ObjectMeta.Annotations, dsl.annotation) diff --git a/pkg/timewindow/days.go b/pkg/timewindow/days.go index 9d0e8b956..8a46e054d 100644 --- a/pkg/timewindow/days.go +++ b/pkg/timewindow/days.go @@ -81,11 +81,11 @@ func parseWeekday(day string) (time.Weekday, error) { if n >= 0 && n < 7 { return time.Weekday(n), nil } - return time.Sunday, fmt.Errorf("Invalid weekday, number out of range: %s", day) + return time.Sunday, fmt.Errorf("invalid weekday, number out of range: %s", day) } if weekday, ok := dayStrings[strings.ToLower(day)]; ok { return weekday, nil } - return time.Sunday, fmt.Errorf("Invalid weekday: %s", day) + return time.Sunday, fmt.Errorf("invalid weekday: %s", day) } diff --git a/pkg/timewindow/timewindow.go b/pkg/timewindow/timewindow.go index 13e24d8dd..973980ead 100644 --- a/pkg/timewindow/timewindow.go +++ b/pkg/timewindow/timewindow.go @@ -77,5 +77,5 @@ func parseTime(s string, loc *time.Location) (time.Time, error) { } } - return time.Now(), fmt.Errorf("Invalid time format: %s", s) + return time.Now(), fmt.Errorf("invalid time format: %s", s) } From 3cb82a0da18adb93c6626fdef9d1c5ad2202eda6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Thu, 7 Nov 2024 19:46:38 +0100 Subject: [PATCH 06/22] Fix incorrect string prints A few strings have evolved to eventually remove all the templating part of their strings, yet kept the formatting features. This is incorrect, and will not pass staticcheck SA1006 and S1039. 
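For illustration only (not part of this patch), the two redundant patterns these checks flag, using values taken from the diff below:

    package main

    import (
        "fmt"
        "math/rand"
        "strconv"
    )

    func main() {
        // SA1006: Sprintf over a non-constant string with no further
        // arguments; the value is already a string, and any '%' in it
        // would be misinterpreted as a verb.
        id := fmt.Sprintf(strconv.Itoa(rand.Intn(100)))

        // S1039: Sprintf over a constant with no verbs is equally redundant.
        manifest := fmt.Sprintf("testfiles/kured-ds-signal.yaml")

        // What this patch switches to:
        id = strconv.Itoa(rand.Intn(100))
        manifest = "testfiles/kured-ds-signal.yaml"

        fmt.Println(id, manifest)
    }
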
Signed-off-by: Jean-Philippe Evrard --- tests/kind/main_test.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/kind/main_test.go b/tests/kind/main_test.go index c09c71e16..fec78a917 100644 --- a/tests/kind/main_test.go +++ b/tests/kind/main_test.go @@ -152,7 +152,7 @@ func TestE2EWithCommand(t *testing.T) { t.Run(version, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel - randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + randomInt := strconv.Itoa(rand.Intn(100)) kindClusterName := fmt.Sprintf("kured-e2e-command-%v-%v", version, randomInt) kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) @@ -202,7 +202,7 @@ func TestE2EWithSignal(t *testing.T) { t.Run(version, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel - randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + randomInt := strconv.Itoa(rand.Intn(100)) kindClusterName := fmt.Sprintf("kured-e2e-signal-%v-%v", version, randomInt) kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) @@ -252,7 +252,7 @@ func TestE2EConcurrentWithCommand(t *testing.T) { t.Run(version, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel - randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + randomInt := strconv.Itoa(rand.Intn(100)) kindClusterName := fmt.Sprintf("kured-e2e-concurrentcommand-%v-%v", version, randomInt) kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) @@ -302,7 +302,7 @@ func TestE2EConcurrentWithSignal(t *testing.T) { t.Run(version, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel - randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + randomInt := strconv.Itoa(rand.Intn(100)) kindClusterName := fmt.Sprintf("kured-e2e-concurrentsignal-%v-%v", version, randomInt) kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) @@ -351,16 +351,16 @@ func TestCordonningIsKept(t *testing.T) { t.Run(variant, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel - randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + randomInt := strconv.Itoa(rand.Intn(100)) kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, randomInt) - kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-next.yaml") + kindClusterConfigFile := "../../.github/kind-cluster-next.yaml" kindContext := fmt.Sprintf("kind-%v", kindClusterName) var manifest string if variant == "concurrency1" { - manifest = fmt.Sprintf("testfiles/kured-ds-signal.yaml") + manifest = "testfiles/kured-ds-signal.yaml" } else { - manifest = fmt.Sprintf("testfiles/kured-ds-concurrent-signal.yaml") + manifest = "testfiles/kured-ds-concurrent-signal.yaml" } k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(manifest)) defer k.FlushLog() @@ -400,9 +400,9 @@ func TestE2EBlocker(t *testing.T) { t.Run(variant, func(t *testing.T) { t.Parallel() // Allow tests to run in parallel - randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + randomInt := strconv.Itoa(rand.Intn(100)) kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, randomInt) - kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-next.yaml") + 
kindClusterConfigFile := "../../.github/kind-cluster-next.yaml" kindContext := fmt.Sprintf("kind-%v", kindClusterName) k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", variant))) @@ -421,7 +421,7 @@ func TestE2EBlocker(t *testing.T) { k.Write([]byte("Now running e2e tests")) - if err := k.RunCmd("bash", fmt.Sprintf("testfiles/%v.sh",variant), kindContext); err != nil { + if err := k.RunCmd("bash", fmt.Sprintf("testfiles/%v.sh", variant), kindContext); err != nil { t.Fatalf("node blocker test did not succeed: %v", err) } }) From 76765f5dd130910e9d5b643888e8fcc3d37329ef Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Thu, 7 Nov 2024 19:50:15 +0100 Subject: [PATCH 07/22] Add staticcheck in make tests Without this, people like myself will forget to run staticcheck. This fixes it by making it part of make tests, which will run with all the fast tests in CI. Signed-off-by: Jean-Philippe Evrard --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a7dabc9a9..c81b1d31a 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ bootstrap-tools: $(HACKDIR) command -v $(HACKDIR)/cosign || curl -sSfL https://github.com/sigstore/cosign/releases/download/v2.2.3/cosign-linux-amd64 -o $(HACKDIR)/cosign command -v $(HACKDIR)/shellcheck || (curl -sSfL https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz | tar -J -v -x shellcheck-stable/shellcheck && mv shellcheck-stable/shellcheck $(HACKDIR)/shellcheck && rmdir shellcheck-stable) chmod +x $(HACKDIR)/goreleaser $(HACKDIR)/cosign $(HACKDIR)/syft $(HACKDIR)/shellcheck - # go install honnef.co/go/tools/cmd/staticcheck@latest + command -v staticcheck || go install honnef.co/go/tools/cmd/staticcheck@latest clean: rm -rf ./dist @@ -71,4 +71,4 @@ test: bootstrap-tools go test -test.short -json ./... > test.json echo "Running shellcheck" find . -name '*.sh' | xargs -n1 $(HACKDIR)/shellcheck - # Need to add staticcheck to replace golint as golint is deprecated, and staticcheck is the recommendation + staticcheck ./... From d5e1b6e4b463a1916e46a2ca4cbbe234a7753132 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 1 Nov 2024 10:26:07 +0100 Subject: [PATCH 08/22] Fix discrepency metrics <> rebootAsRequired Without this patch, one metric could say "reboot is required" while the rebootAsRequired tick did not run (long period for example). This is a problem, as it leads to misexpectations: "Why did the system not reboot, while the metrics indicate a reboot was required". This solves it by inlining the metrics management within the rebootAsRequired goroutine. 
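For illustration only, a compressed sketch of the resulting pattern, not the kured code itself: the gauge variable mirrors the one in the diff below, while the metric name, label and checker are stand-ins.

    package main

    import (
        "fmt"
        "time"

        "github.com/prometheus/client_golang/prometheus"
    )

    var rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "kured_reboot_required",
        Help: "OS requires reboot due to software updates.",
    }, []string{"node"})

    // rebootRequired stands in for checker.RebootRequired().
    func rebootRequired() bool { return false }

    func main() {
        for range time.Tick(time.Minute) {
            // The metric is set from the very same evaluation that drives
            // the reboot decision, so the two can no longer drift apart.
            if rebootRequired() {
                rebootRequiredGauge.WithLabelValues("node-a").Set(1)
                fmt.Println("reboot required: entering the reboot workflow")
                // ... acquire lock, cordon, drain, reboot ...
            } else {
                rebootRequiredGauge.WithLabelValues("node-a").Set(0)
            }
        }
    }
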
Closes: #725 Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index a6b1c0e82..5cc715f3d 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -283,7 +283,6 @@ func main() { lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client) - go maintainRebootRequiredMetric(nodeID, rebootChecker) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) @@ -468,17 +467,6 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error { return nil } -func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) { - for { - if checker.RebootRequired() { - rebootRequiredGauge.WithLabelValues(nodeID).Set(1) - } else { - rebootRequiredGauge.WithLabelValues(nodeID).Set(0) - } - time.Sleep(time.Minute) - } -} - func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error { node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { @@ -610,7 +598,9 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) // Remove taint immediately during startup to quickly allow scheduling again. + // Also update the metric at startup (after lock shenanigans) if !checker.RebootRequired() { + rebootRequiredGauge.WithLabelValues(nodeID).Set(0) preferNoScheduleTaint.Disable() } @@ -625,9 +615,11 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. if !checker.RebootRequired() { log.Infof("Reboot not required") + rebootRequiredGauge.WithLabelValues(nodeID).Set(0) preferNoScheduleTaint.Disable() continue } + rebootRequiredGauge.WithLabelValues(nodeID).Set(1) node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { From 78628fd74e6eb4efcb622db98f8b8542e931431a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Thu, 7 Nov 2024 23:19:30 +0100 Subject: [PATCH 09/22] Move implementations around No reason to have all the internal tools (timewindows, taints, locks) as public interfaces. Should someone be crazy to rebuild its own kured, then they would need to load our high level tools: the checkers/blockers/rebooters. 
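For illustration only (not part of this patch), what stays importable to an out-of-tree consumer after the move; the constructor and its signature are the ones shown in filechecker.go below:

    package main

    import (
        "fmt"

        "github.com/kubereboot/kured/pkg/checkers"
    )

    func main() {
        // The high-level pieces under pkg/ remain public, while taints,
        // locks, timewindow and delaytick move under internal/ and can no
        // longer be imported from outside the kured module.
        checker, err := checkers.NewFileRebootChecker("/var/run/reboot-required")
        if err != nil {
            fmt.Println("error:", err)
            return
        }
        fmt.Println("reboot required:", checker.RebootRequired())
    }
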
Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 15 ++- .../daemonsetlock/daemonsetlock.go | 0 .../daemonsetlock/daemonsetlock_test.go | 0 {pkg => internal}/delaytick/delaytick.go | 0 {pkg => internal}/taints/taints.go | 2 +- {pkg => internal}/timewindow/days.go | 0 {pkg => internal}/timewindow/days_test.go | 0 {pkg => internal}/timewindow/timewindow.go | 0 .../timewindow/timewindow_test.go | 0 internal/validators.go | 34 ------ pkg/checkers/checker.go | 106 ++---------------- pkg/checkers/commandchecker.go | 75 +++++++++++++ ...checker_test.go => commandchecker_test.go} | 0 pkg/checkers/filechecker.go | 28 +++++ pkg/reboot/reboot.go | 20 ++++ 15 files changed, 141 insertions(+), 139 deletions(-) rename {pkg => internal}/daemonsetlock/daemonsetlock.go (100%) rename {pkg => internal}/daemonsetlock/daemonsetlock_test.go (100%) rename {pkg => internal}/delaytick/delaytick.go (100%) rename {pkg => internal}/taints/taints.go (98%) rename {pkg => internal}/timewindow/days.go (100%) rename {pkg => internal}/timewindow/days_test.go (100%) rename {pkg => internal}/timewindow/timewindow.go (100%) rename {pkg => internal}/timewindow/timewindow_test.go (100%) create mode 100644 pkg/checkers/commandchecker.go rename pkg/checkers/{checker_test.go => commandchecker_test.go} (100%) create mode 100644 pkg/checkers/filechecker.go diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 5cc715f3d..76615c15a 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,6 +4,10 @@ import ( "context" "encoding/json" "fmt" + "github.com/kubereboot/kured/internal/daemonsetlock" + "github.com/kubereboot/kured/internal/delaytick" + "github.com/kubereboot/kured/internal/taints" + "github.com/kubereboot/kured/internal/timewindow" "math/rand" "net/http" "net/url" @@ -15,14 +19,9 @@ import ( "time" "github.com/containrrr/shoutrrr" - "github.com/kubereboot/kured/internal" "github.com/kubereboot/kured/pkg/blockers" "github.com/kubereboot/kured/pkg/checkers" - "github.com/kubereboot/kured/pkg/daemonsetlock" - "github.com/kubereboot/kured/pkg/delaytick" "github.com/kubereboot/kured/pkg/reboot" - "github.com/kubereboot/kured/pkg/taints" - "github.com/kubereboot/kured/pkg/timewindow" papi "github.com/prometheus/client_golang/api" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -242,12 +241,12 @@ func main() { log.Infof("Reboot schedule: %v", window) log.Infof("Reboot method: %s", rebootMethod) - rebooter, err := internal.NewRebooter(rebootMethod, rebootCommand, rebootSignal) + rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal) if err != nil { log.Fatalf("Failed to build rebooter: %v", err) } - rebootChecker, err := internal.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile) + rebootChecker, err := checkers.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile) if err != nil { log.Fatalf("Failed to build reboot checker: %v", err) } @@ -663,7 +662,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } if !acquired { log.Warnf("Lock already held: %v", holder) - // Prefer to not schedule pods onto this node to avoid draing the same pod multiple times. + // Prefer to not schedule pods onto this node to avoid draining the same pod multiple times. 
preferNoScheduleTaint.Enable() continue } diff --git a/pkg/daemonsetlock/daemonsetlock.go b/internal/daemonsetlock/daemonsetlock.go similarity index 100% rename from pkg/daemonsetlock/daemonsetlock.go rename to internal/daemonsetlock/daemonsetlock.go diff --git a/pkg/daemonsetlock/daemonsetlock_test.go b/internal/daemonsetlock/daemonsetlock_test.go similarity index 100% rename from pkg/daemonsetlock/daemonsetlock_test.go rename to internal/daemonsetlock/daemonsetlock_test.go diff --git a/pkg/delaytick/delaytick.go b/internal/delaytick/delaytick.go similarity index 100% rename from pkg/delaytick/delaytick.go rename to internal/delaytick/delaytick.go diff --git a/pkg/taints/taints.go b/internal/taints/taints.go similarity index 98% rename from pkg/taints/taints.go rename to internal/taints/taints.go index a9592e90c..bd59e8d6a 100644 --- a/pkg/taints/taints.go +++ b/internal/taints/taints.go @@ -138,7 +138,7 @@ func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, ef }, } } else { - // add missing taint to exsting list + // add missing taint to existing list patches = []patchTaints{ { Op: "add", diff --git a/pkg/timewindow/days.go b/internal/timewindow/days.go similarity index 100% rename from pkg/timewindow/days.go rename to internal/timewindow/days.go diff --git a/pkg/timewindow/days_test.go b/internal/timewindow/days_test.go similarity index 100% rename from pkg/timewindow/days_test.go rename to internal/timewindow/days_test.go diff --git a/pkg/timewindow/timewindow.go b/internal/timewindow/timewindow.go similarity index 100% rename from pkg/timewindow/timewindow.go rename to internal/timewindow/timewindow.go diff --git a/pkg/timewindow/timewindow_test.go b/internal/timewindow/timewindow_test.go similarity index 100% rename from pkg/timewindow/timewindow_test.go rename to internal/timewindow/timewindow_test.go diff --git a/internal/validators.go b/internal/validators.go index b0e32381d..5bf0569ce 100644 --- a/internal/validators.go +++ b/internal/validators.go @@ -1,35 +1 @@ package internal - -import ( - "fmt" - "github.com/kubereboot/kured/pkg/checkers" - "github.com/kubereboot/kured/pkg/reboot" - log "github.com/sirupsen/logrus" -) - -// NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, -// then chains to the right constructor. -func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (reboot.Rebooter, error) { - switch { - case rebootMethod == "command": - log.Infof("Reboot command: %s", rebootCommand) - return reboot.NewCommandRebooter(rebootCommand) - case rebootMethod == "signal": - log.Infof("Reboot signal: %d", rebootSignal) - return reboot.NewSignalRebooter(rebootSignal) - default: - return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod) - } -} - -// NewRebootChecker validates the rebootSentinelCommand, rebootSentinelFile input, -// then chains to the right constructor. 
-func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (checkers.Checker, error) { - // An override of rebootSentinelCommand means a privileged command - if rebootSentinelCommand != "" { - log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) - return checkers.NewCommandChecker(rebootSentinelCommand, 1, true) - } - log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) - return checkers.NewFileRebootChecker(rebootSentinelFile) -} diff --git a/pkg/checkers/checker.go b/pkg/checkers/checker.go index a696b41fd..1693184e2 100644 --- a/pkg/checkers/checker.go +++ b/pkg/checkers/checker.go @@ -1,14 +1,6 @@ package checkers -import ( - "bytes" - "fmt" - "github.com/google/shlex" - log "github.com/sirupsen/logrus" - "os" - "os/exec" - "strings" -) +import "github.com/sirupsen/logrus" // Checker is the standard interface to use to check // if a reboot is required. Its types must implement a @@ -18,92 +10,14 @@ type Checker interface { RebootRequired() bool } -// FileRebootChecker is the default reboot checker. -// It is unprivileged, and tests the presence of a files -type FileRebootChecker struct { - FilePath string -} - -// RebootRequired checks the file presence -// needs refactoring to also return an error, instead of leaking it inside the code. -// This needs refactoring to get rid of NewCommand -// This needs refactoring to only contain file location, instead of CheckCommand -func (rc FileRebootChecker) RebootRequired() bool { - if _, err := os.Stat(rc.FilePath); err == nil { - return true - } - return false -} - -// NewFileRebootChecker is the constructor for the file based reboot checker -// TODO: Add extra input validation on filePath string here -func NewFileRebootChecker(filePath string) (*FileRebootChecker, error) { - return &FileRebootChecker{ - FilePath: filePath, - }, nil -} - -// CommandChecker is using a custom command to check -// if a reboot is required. There are two modes of behaviour, -// if Privileged is granted, the NamespacePid is used to nsenter -// the given PID's namespace. -type CommandChecker struct { - CheckCommand []string - NamespacePid int - Privileged bool -} - -// RebootRequired for CommandChecker runs a command without returning -// any eventual error. This should be later refactored to return the errors, -// instead of logging and fataling them here. -func (rc CommandChecker) RebootRequired() bool { - bufStdout := new(bytes.Buffer) - bufStderr := new(bytes.Buffer) - cmd := exec.Command(rc.CheckCommand[0], rc.CheckCommand[1:]...) - cmd.Stdout = bufStdout - cmd.Stderr = bufStderr - - if err := cmd.Run(); err != nil { - switch err := err.(type) { - case *exec.ExitError: - // We assume a non-zero exit code means 'reboot not required', but of course - // the user could have misconfigured the sentinel command or something else - // went wrong during its execution. In that case, not entering a reboot loop - // is the right thing to do, and we are logging stdout/stderr of the command - // so it should be obvious what is wrong. - if cmd.ProcessState.ExitCode() != 1 { - log.Warn(fmt.Sprintf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) - } - return false - default: - // Something was grossly misconfigured, such as the command path being wrong. 
- log.Fatal(fmt.Sprintf("Error invoking sentinel command: %v", err), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) - } - } - log.Info("checking if reboot is required", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) - return true -} - -// NewCommandChecker is the constructor for the commandChecker, and by default -// runs new commands in a privileged fashion. -// Privileged means wrapping the command with nsenter. -// It allows to run a command from systemd's namespace for example (pid 1) -// This relies on hostPID:true and privileged:true to enter host mount space -// For info, rancher based need different pid, which should be user given. -// until we have a better discovery mechanism. -func NewCommandChecker(sentinelCommand string, pid int, privileged bool) (*CommandChecker, error) { - var cmd []string - if privileged { - cmd = append(cmd, "/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--") - } - parsedCommand, err := shlex.Split(sentinelCommand) - if err != nil { - return nil, fmt.Errorf("error parsing provided sentinel command: %v", err) +// NewRebootChecker validates the rebootSentinelCommand, rebootSentinelFile input, +// then chains to the right constructor. +func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (Checker, error) { + // An override of rebootSentinelCommand means a privileged command + if rebootSentinelCommand != "" { + logrus.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) + return NewCommandChecker(rebootSentinelCommand, 1, true) } - cmd = append(cmd, parsedCommand...) - return &CommandChecker{ - CheckCommand: cmd, - NamespacePid: pid, - Privileged: privileged, - }, nil + logrus.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) + return NewFileRebootChecker(rebootSentinelFile) } diff --git a/pkg/checkers/commandchecker.go b/pkg/checkers/commandchecker.go new file mode 100644 index 000000000..76185a0f6 --- /dev/null +++ b/pkg/checkers/commandchecker.go @@ -0,0 +1,75 @@ +package checkers + +import ( + "bytes" + "fmt" + "github.com/google/shlex" + log "github.com/sirupsen/logrus" + "os/exec" + "strings" +) + +// CommandChecker is using a custom command to check +// if a reboot is required. There are two modes of behaviour, +// if Privileged is granted, the NamespacePid is used to nsenter +// the given PID's namespace. +type CommandChecker struct { + CheckCommand []string + NamespacePid int + Privileged bool +} + +// RebootRequired for CommandChecker runs a command without returning +// any eventual error. This should be later refactored to return the errors, +// instead of logging and fataling them here. +func (rc CommandChecker) RebootRequired() bool { + bufStdout := new(bytes.Buffer) + bufStderr := new(bytes.Buffer) + cmd := exec.Command(rc.CheckCommand[0], rc.CheckCommand[1:]...) + cmd.Stdout = bufStdout + cmd.Stderr = bufStderr + + if err := cmd.Run(); err != nil { + switch err := err.(type) { + case *exec.ExitError: + // We assume a non-zero exit code means 'reboot not required', but of course + // the user could have misconfigured the sentinel command or something else + // went wrong during its execution. In that case, not entering a reboot loop + // is the right thing to do, and we are logging stdout/stderr of the command + // so it should be obvious what is wrong. 
+ if cmd.ProcessState.ExitCode() != 1 { + log.Warn(fmt.Sprintf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + } + return false + default: + // Something was grossly misconfigured, such as the command path being wrong. + log.Fatal(fmt.Sprintf("Error invoking sentinel command: %v", err), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + } + } + log.Info("checking if reboot is required", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + return true +} + +// NewCommandChecker is the constructor for the commandChecker, and by default +// runs new commands in a privileged fashion. +// Privileged means wrapping the command with nsenter. +// It allows to run a command from systemd's namespace for example (pid 1) +// This relies on hostPID:true and privileged:true to enter host mount space +// For info, rancher based need different pid, which should be user given. +// until we have a better discovery mechanism. +func NewCommandChecker(sentinelCommand string, pid int, privileged bool) (*CommandChecker, error) { + var cmd []string + if privileged { + cmd = append(cmd, "/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--") + } + parsedCommand, err := shlex.Split(sentinelCommand) + if err != nil { + return nil, fmt.Errorf("error parsing provided sentinel command: %v", err) + } + cmd = append(cmd, parsedCommand...) + return &CommandChecker{ + CheckCommand: cmd, + NamespacePid: pid, + Privileged: privileged, + }, nil +} diff --git a/pkg/checkers/checker_test.go b/pkg/checkers/commandchecker_test.go similarity index 100% rename from pkg/checkers/checker_test.go rename to pkg/checkers/commandchecker_test.go diff --git a/pkg/checkers/filechecker.go b/pkg/checkers/filechecker.go new file mode 100644 index 000000000..411cf0fac --- /dev/null +++ b/pkg/checkers/filechecker.go @@ -0,0 +1,28 @@ +package checkers + +import "os" + +// FileRebootChecker is the default reboot checker. +// It is unprivileged, and tests the presence of a files +type FileRebootChecker struct { + FilePath string +} + +// RebootRequired checks the file presence +// needs refactoring to also return an error, instead of leaking it inside the code. +// This needs refactoring to get rid of NewCommand +// This needs refactoring to only contain file location, instead of CheckCommand +func (rc FileRebootChecker) RebootRequired() bool { + if _, err := os.Stat(rc.FilePath); err == nil { + return true + } + return false +} + +// NewFileRebootChecker is the constructor for the file based reboot checker +// TODO: Add extra input validation on filePath string here +func NewFileRebootChecker(filePath string) (*FileRebootChecker, error) { + return &FileRebootChecker{ + FilePath: filePath, + }, nil +} diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index bfcf2036a..78eaa380f 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -1,5 +1,10 @@ package reboot +import ( + "fmt" + "github.com/sirupsen/logrus" +) + // Rebooter is the standard interface to use to execute // the reboot, after it has been considered as necessary. // The Reboot method does not expect any return, yet should @@ -7,3 +12,18 @@ package reboot type Rebooter interface { Reboot() error } + +// NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, +// then chains to the right constructor. 
+func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (Rebooter, error) { + switch { + case rebootMethod == "command": + logrus.Infof("Reboot command: %s", rebootCommand) + return NewCommandRebooter(rebootCommand) + case rebootMethod == "signal": + logrus.Infof("Reboot signal: %d", rebootSignal) + return NewSignalRebooter(rebootSignal) + default: + return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod) + } +} From f591de4aff59d90ade3e80f45b5b2958db0550f4 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 12 Oct 2024 11:38:48 +0200 Subject: [PATCH 10/22] Cleanup rebootAsRequired This patches simplifies the principal loop: rebootAsRequired - Previous patch "Fix discrepency metrics <> rebootAsRequired" changed the behaviour of the metrics, ensuring the metrics and the checks for reboots are aligned. However, this meant that the metrics would not be correct for long periods of time if the period is high. This patches therefore clarifies the ticking purpose to avoid users to set large values, as we have seen in multiple questions in the past (ppl were missing that part of the documentation, and could find the name "period" a misnomer). - With the ticking clarified, it was the opportunity to remove the randomization bit. It was very unlikely, even on large clusters, that calls would trigger a storm on the API server. Similarly, the delay was not bringing any value. By removing those two items, the code can be simplified futher by removing the delaytick and its random seed. - However, by ticking more frequently, we now have more calls on the API server, which might have an impact on large clusters. In order to fix that, I reschuffled the loop to quickly know if a reboot is required at each tick, which helps a bit reducing some of the calls. This can be further improved by sharing node/ds info through the calls in the tick in order to make the process more efficient. - A new struct GenericRebooter was introduced to keep data all the rebooters need. For example, here, it keeps the rebootDelay imposed to the reboot function. Each rebooter can implement its own reboot method, but to simplify there the generic rebooter implements a DelayReboot() method, which is then called from all the Reboot methods of the concrete types. - On the way, this commit also changed the RebootBlocked function, to also return the list of the names of (types) blocking the reboot. This is useful to debug, but also can be useful to later expose the content of the blocker into specific metrics. 
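For illustration only, a minimal sketch of the embedded helper described above; GenericRebooter, RebootDelay and DelayReboot all appear in the command rebooter diff below, but their definition sits outside this hunk, so the bodies here are an assumption:

    package reboot

    import "time"

    // GenericRebooter keeps the configuration every rebooter shares; the
    // concrete rebooters embed it and call DelayReboot() before rebooting.
    type GenericRebooter struct {
        RebootDelay time.Duration
    }

    // DelayReboot waits for the configured delay, if any, before the
    // embedding rebooter goes on to reboot the node.
    func (g GenericRebooter) DelayReboot() {
        if g.RebootDelay > 0 {
            time.Sleep(g.RebootDelay)
        }
    }
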
Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 282 +++++++++++++++++----------------- pkg/blockers/blockers.go | 9 +- pkg/blockers/blockers_test.go | 2 +- pkg/reboot/command.go | 12 +- pkg/reboot/reboot.go | 18 ++- pkg/reboot/signal.go | 15 +- 6 files changed, 189 insertions(+), 149 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 76615c15a..af5302f1d 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,21 +4,10 @@ import ( "context" "encoding/json" "fmt" + "github.com/containrrr/shoutrrr" "github.com/kubereboot/kured/internal/daemonsetlock" - "github.com/kubereboot/kured/internal/delaytick" "github.com/kubereboot/kured/internal/taints" "github.com/kubereboot/kured/internal/timewindow" - "math/rand" - "net/http" - "net/url" - "os" - "reflect" - "sort" - "strconv" - "strings" - "time" - - "github.com/containrrr/shoutrrr" "github.com/kubereboot/kured/pkg/blockers" "github.com/kubereboot/kured/pkg/checkers" "github.com/kubereboot/kured/pkg/reboot" @@ -33,6 +22,14 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" kubectldrain "k8s.io/kubectl/pkg/drain" + "net/http" + "net/url" + "os" + "reflect" + "sort" + "strconv" + "strings" + "time" ) var ( @@ -133,8 +130,8 @@ func main() { "delay reboot for this duration (default: 0, disabled)") flag.StringVar(&rebootMethod, "reboot-method", "command", "method to use for reboots. Available: command") - flag.DurationVar(&period, "period", time.Minute*60, - "sentinel check period") + flag.DurationVar(&period, "period", time.Minute, + "period at which the main operations are done") flag.StringVar(&dsNamespace, "ds-namespace", "kube-system", "namespace containing daemonset on which to place lock") flag.StringVar(&dsName, "ds-name", "kured", @@ -241,7 +238,8 @@ func main() { log.Infof("Reboot schedule: %v", window) log.Infof("Reboot method: %s", rebootMethod) - rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal) + + rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal, rebootDelay) if err != nil { log.Fatalf("Failed to build rebooter: %v", err) } @@ -281,7 +279,7 @@ func main() { } lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) - go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client) + go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client, period) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) @@ -540,41 +538,79 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri } } -func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset) { +func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset, period time.Duration) { - source := rand.NewSource(time.Now().UnixNano()) - tick := delaytick.New(source, 1*time.Minute) - for range tick { - holding, lockData, err := lock.Holding() - if err != nil { - log.Errorf("Error testing lock: %v", err) - } - if holding { + preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) + + // No reason to delay the first 
ticks. + // On top of it, we used to leak a goroutine, which was never garbage collected. + // Starting on go1.23, with Tick, we would have that goroutine garbage collected. + c := time.Tick(period) + for range c { + + rebootRequired := checker.RebootRequired() + if !rebootRequired { + rebootRequiredGauge.WithLabelValues(nodeID).Set(0) + // Now cleaning up post reboot + + // Quickly allow rescheduling. + // The node could be still cordonned anyway + preferNoScheduleTaint.Disable() + + // Test API server first. If cannot get node, we should not do anything. node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { - log.Errorf("Error retrieving node object via k8s API: %v", err) + log.Infof("Error retrieving node object via k8s API: %v", err) continue } - if !lockData.Metadata.Unschedulable { - err = uncordon(client, node) - if err != nil { - log.Errorf("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err) - continue - } else { + // Get lock data to know if need to uncordon the node + // to get the node back to its previous state + // TODO: Need to move to another method to check the current data of the lock relevant for this node + holding, lockData, err := lock.Holding() + if err != nil { + log.Infof("Error checking lock - Not applying any action: %v", err) + continue + } + + // we check if holding ONLY to know if lockData is valid. + // When moving to fetch lockData as a separate method, remove + // this whole condition. + // However, it means that Release() + // need to behave idempotently regardless or not the lock is + // held, but that's an ideal state. + // what we should simply do is reconcile the lock data + // with the node spec. But behind the scenes its a bug + // if it's not holding due to an error + if holding && !lockData.Metadata.Unschedulable { + // Split into two lines to remember I need to remove the first + // condition ;) + if node.Spec.Unschedulable != lockData.Metadata.Unschedulable && lockData.Metadata.Unschedulable == false { + err = uncordon(client, node) + if err != nil { + log.Infof("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err) + continue + } + // TODO, modify the actions to directly log and notify, instead of individual methods giving + // an incomplete view of the lifecycle if notifyURL != "" { if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateUncordon, nodeID)); err != nil { log.Warnf("Error notifying: %v", err) } } } + + } + + // Releasing lock earlier is nice for other nodes + err = lock.Release() + if err != nil { + log.Infof("Error releasing lock, will retry: %v", err) + continue } - // If we're holding the lock we know we've tried, in a prior run, to reboot - // So (1) we want to confirm that the reboot succeeded practically ( !rebootRequired() ) - // And (2) check if we previously annotated the node that it was in the process of being rebooted, - // And finally (3) if it has that annotation, to delete it. 
- // This indicates to other node tools running on the cluster that this node may be a candidate for maintenance - if annotateNodes && !checker.RebootRequired() { + // Regardless or not we are holding the lock + // The node should not say it's still in progress if the reboot is done + if annotateNodes { if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok { err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) if err != nil { @@ -583,124 +619,94 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } } - err = lock.Release() - if err != nil { - log.Errorf("Error releasing lock, will retry: %v", err) - continue - } - break } else { - break - } - } + rebootRequiredGauge.WithLabelValues(nodeID).Set(1) - preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) - - // Remove taint immediately during startup to quickly allow scheduling again. - // Also update the metric at startup (after lock shenanigans) - if !checker.RebootRequired() { - rebootRequiredGauge.WithLabelValues(nodeID).Set(0) - preferNoScheduleTaint.Disable() - } + // Act on reboot required. + if !window.Contains(time.Now()) { + log.Debugf("Reboot required for node %v, but outside maintenance window", nodeID) + continue + } + // moved up, because we should not put an annotation "Going to be rebooting", if + // we know well that this won't reboot. TBD as some ppl might have another opinion. + if blocked, names := blockers.RebootBlocked(blockCheckers...); blocked { + log.Infof("Reboot blocked by %v", strings.Join(names, ", ")) + continue + } - source = rand.NewSource(time.Now().UnixNano()) - tick = delaytick.New(source, period) - for range tick { - if !window.Contains(time.Now()) { - // Remove taint outside the reboot time window to allow for normal operation. - preferNoScheduleTaint.Disable() - continue - } + // Test API server first. If cannot get node, we should not do anything. + node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) + if err != nil { + log.Debugf("Error retrieving node object via k8s API: %v", err) + continue + } + // nodeMeta contains the node Metadata that should be included in the lock + nodeMeta := daemonsetlock.NodeMeta{Unschedulable: node.Spec.Unschedulable} + + var timeNowString string + if annotateNodes { + if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; !ok { + timeNowString = time.Now().Format(time.RFC3339) + // Annotate this node to indicate that "I am going to be rebooted!" + // so that other node maintenance tools running on the cluster are aware that this node is in the process of a "state transition" + annotations := map[string]string{KuredRebootInProgressAnnotation: timeNowString} + // & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot + annotations[KuredMostRecentRebootNeededAnnotation] = timeNowString + err := addNodeAnnotations(client, nodeID, annotations) + if err != nil { + continue + } + } + } - if !checker.RebootRequired() { - log.Infof("Reboot not required") - rebootRequiredGauge.WithLabelValues(nodeID).Set(0) - preferNoScheduleTaint.Disable() - continue - } - rebootRequiredGauge.WithLabelValues(nodeID).Set(1) + // Prefer to not schedule pods onto this node to avoid draing the same pod multiple times. 
+ preferNoScheduleTaint.Enable() - node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) - if err != nil { - log.Fatalf("Error retrieving node object via k8s API: %v", err) - } + // This could be merged into a single idempotent "Acquire" lock + holding, _, err := lock.Holding() + if err != nil { + log.Debugf("Error testing lock: %v", err) + continue + } - nodeMeta := daemonsetlock.NodeMeta{Unschedulable: node.Spec.Unschedulable} - - var timeNowString string - if annotateNodes { - if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; !ok { - timeNowString = time.Now().Format(time.RFC3339) - // Annotate this node to indicate that "I am going to be rebooted!" - // so that other node maintenance tools running on the cluster are aware that this node is in the process of a "state transition" - annotations := map[string]string{KuredRebootInProgressAnnotation: timeNowString} - // & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot - annotations[KuredMostRecentRebootNeededAnnotation] = timeNowString - err := addNodeAnnotations(client, nodeID, annotations) + if !holding { + acquired, holder, err := lock.Acquire(nodeMeta) if err != nil { + log.Debugf("Error acquiring lock: %v", err) + continue + } + if !acquired { + log.Infof("Lock already held by %v, will retry", holder) continue } } - } - var rebootRequiredBlockCondition string - if blockers.RebootBlocked(blockCheckers...) { - rebootRequiredBlockCondition = ", but blocked at this time" - continue - } - log.Infof("Reboot required%s", rebootRequiredBlockCondition) - - holding, _, err := lock.Holding() - if err != nil { - log.Errorf("Error testing lock: %v", err) - } - - if !holding { - acquired, holder, err := lock.Acquire(nodeMeta) + err = drain(client, node) if err != nil { - log.Errorf("Error acquiring lock: %v", err) - } - if !acquired { - log.Warnf("Lock already held: %v", holder) - // Prefer to not schedule pods onto this node to avoid draining the same pod multiple times. 
- preferNoScheduleTaint.Enable() - continue + if !forceReboot { + log.Infof("Unable to cordon or drain %s: %v, will force-reboot by releasing lock and uncordon until next success", node.GetName(), err) + err = lock.Release() + if err != nil { + log.Infof("Error in best-effort releasing lock: %v", err) + } + log.Infof("Performing a best-effort uncordon after failed cordon and drain") + uncordon(client, node) + continue + } } - } - - err = drain(client, node) - if err != nil { - if !forceReboot { - log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err) - err = lock.Release() - if err != nil { - log.Errorf("Error releasing lock: %v", err) + if notifyURL != "" { + if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil { + log.Infof("Error notifying: %v", err) } - log.Infof("Performing a best-effort uncordon after failed cordon and drain") - uncordon(client, node) - continue } - } - if rebootDelay > 0 { - log.Infof("Delaying reboot for %v", rebootDelay) - time.Sleep(rebootDelay) - } + log.Infof("Triggering reboot for node %v", nodeID) - if notifyURL != "" { - if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil { - log.Warnf("Error notifying: %v", err) + rebooter.Reboot() + for { + log.Infof("Waiting for reboot") + time.Sleep(time.Minute) } } - log.Infof("Triggering reboot for node %v", nodeID) - - err = rebooter.Reboot() - if err != nil { - log.Fatalf("Unable to reboot node: %v", err) - } - for { - log.Infof("Waiting for reboot") - time.Sleep(time.Minute) - } } } diff --git a/pkg/blockers/blockers.go b/pkg/blockers/blockers.go index 04383103b..5832c6b5c 100644 --- a/pkg/blockers/blockers.go +++ b/pkg/blockers/blockers.go @@ -1,14 +1,17 @@ package blockers +import "fmt" + // RebootBlocked checks that a single block Checker // will block the reboot or not. 
-func RebootBlocked(blockers ...RebootBlocker) bool { +func RebootBlocked(blockers ...RebootBlocker) (blocked bool, blockernames []string) { for _, blocker := range blockers { if blocker.IsBlocked() { - return true + blocked = true + blockernames = append(blockernames, fmt.Sprintf("%T", blocker)) } } - return false + return } // RebootBlocker interface should be implemented by types diff --git a/pkg/blockers/blockers_test.go b/pkg/blockers/blockers_test.go index 653e2281e..8b60c2b51 100644 --- a/pkg/blockers/blockers_test.go +++ b/pkg/blockers/blockers_test.go @@ -57,7 +57,7 @@ func Test_rebootBlocked(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := RebootBlocked(tt.args.blockers...); got != tt.want { + if got, _ := RebootBlocked(tt.args.blockers...); got != tt.want { t.Errorf("rebootBlocked() = %v, want %v", got, tt.want) } }) diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 869d8829c..88b7d577d 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -7,15 +7,18 @@ import ( log "github.com/sirupsen/logrus" "os/exec" "strings" + "time" ) // CommandRebooter holds context-information for a reboot with command type CommandRebooter struct { RebootCommand []string + GenericRebooter } // Reboot triggers the reboot command func (c CommandRebooter) Reboot() error { + c.DelayReboot() log.Infof("Invoking command: %s", c.RebootCommand) bufStdout := new(bytes.Buffer) @@ -34,7 +37,7 @@ func (c CommandRebooter) Reboot() error { // NewCommandRebooter is the constructor to create a CommandRebooter from a string not // yet shell lexed. You can skip this constructor if you parse the data correctly first // when instantiating a CommandRebooter instance. -func NewCommandRebooter(rebootCommand string) (*CommandRebooter, error) { +func NewCommandRebooter(rebootCommand string, rebootDelay time.Duration) (*CommandRebooter, error) { if rebootCommand == "" { return nil, fmt.Errorf("no reboot command specified") } @@ -44,5 +47,10 @@ func NewCommandRebooter(rebootCommand string) (*CommandRebooter, error) { return nil, fmt.Errorf("error %v when parsing reboot command %s", err, rebootCommand) } cmd = append(cmd, parsedCommand...) - return &CommandRebooter{RebootCommand: cmd}, nil + return &CommandRebooter{ + RebootCommand: cmd, + GenericRebooter: GenericRebooter{ + RebootDelay: rebootDelay, + }, + }, nil } diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 78eaa380f..663a3fe8a 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -3,6 +3,7 @@ package reboot import ( "fmt" "github.com/sirupsen/logrus" + "time" ) // Rebooter is the standard interface to use to execute @@ -15,15 +16,26 @@ type Rebooter interface { // NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, // then chains to the right constructor. 
-func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (Rebooter, error) { +func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int, rebootDelay time.Duration) (Rebooter, error) { switch { case rebootMethod == "command": logrus.Infof("Reboot command: %s", rebootCommand) - return NewCommandRebooter(rebootCommand) + return NewCommandRebooter(rebootCommand, rebootDelay) case rebootMethod == "signal": logrus.Infof("Reboot signal: %d", rebootSignal) - return NewSignalRebooter(rebootSignal) + return NewSignalRebooter(rebootSignal, rebootDelay) default: return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod) } } + +type GenericRebooter struct { + RebootDelay time.Duration +} + +func (g GenericRebooter) DelayReboot() { + if g.RebootDelay > 0 { + logrus.Infof("Delayed reboot for %s", g.RebootDelay) + time.Sleep(g.RebootDelay) + } +} diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index 92b7e0418..75d357497 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -2,17 +2,23 @@ package reboot import ( "fmt" + log "github.com/sirupsen/logrus" "os" "syscall" + "time" ) // SignalRebooter holds context-information for a signal reboot. type SignalRebooter struct { Signal int + GenericRebooter } // Reboot triggers the reboot signal func (c SignalRebooter) Reboot() error { + c.DelayReboot() + log.Infof("Invoking signal: %v", c.Signal) + process, err := os.FindProcess(1) if err != nil { return fmt.Errorf("not running on Unix: %v", err) @@ -29,9 +35,14 @@ func (c SignalRebooter) Reboot() error { // NewSignalRebooter is the constructor which sets the signal number. // The constructor does not yet validate any input. It should be done in a later commit. -func NewSignalRebooter(sig int) (*SignalRebooter, error) { +func NewSignalRebooter(sig int, rebootDelay time.Duration) (*SignalRebooter, error) { if sig < 1 { return nil, fmt.Errorf("invalid signal: %v", sig) } - return &SignalRebooter{Signal: sig}, nil + return &SignalRebooter{ + Signal: sig, + GenericRebooter: GenericRebooter{ + RebootDelay: rebootDelay, + }, + }, nil } From 1381c849a824d87bf247b56f6584a6886f40c811 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 8 Nov 2024 19:51:33 +0100 Subject: [PATCH 11/22] Add counter for blockers Without this, there is no way to tell, by looking at the metrics how much reboot attempts were done, or why they were blocked. This is a problem, as only logs are currently usable to expose the blockers. This fixes it by introducing a new vector holding as labels the node names and the reason of the block. For this, all the blockers have to implement another method in the RebootBlocker interface, namely MetricLabel(), to generate a string containing the block reason. To avoid high cardinality, this is basically a static name based on the struct. I did not choose to use a Stringer interface, as I feel the Stringer interface would make more sense for the blocker data itself (including its specific details, for example pod selector string, etc.). 
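To illustrate the new contract (not part of this change set, and the names
below are invented): any blocker, including out-of-tree ones, now has to
provide MetricLabel() next to IsBlocked(), and the returned string should be
a static, low-cardinality reason, since it ends up as the "reason" label of
the new counter.

    package blockers

    import "os"

    // FileFlagBlocker is a hypothetical example blocker: it holds reboots
    // back while a local maintenance flag file exists on the node.
    type FileFlagBlocker struct {
        Path string
    }

    // IsBlocked reports whether the flag file is currently present.
    func (f FileFlagBlocker) IsBlocked() bool {
        _, err := os.Stat(f.Path)
        return err == nil
    }

    // MetricLabel returns a static reason, keeping the cardinality of the
    // kured_reboot_blocked_reason counter low.
    func (f FileFlagBlocker) MetricLabel() string {
        return "file_flag"
    }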
Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 16 ++++++++++++---- pkg/blockers/blockers.go | 13 +++++++------ pkg/blockers/blockers_test.go | 4 ++++ pkg/blockers/kubernetespod.go | 4 ++++ pkg/blockers/prometheus.go | 2 +- 5 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index af5302f1d..6fe50cfb7 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -87,6 +87,11 @@ var ( Name: "reboot_required", Help: "OS requires reboot due to software updates.", }, []string{"node"}) + rebootBlockedCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: "kured", + Name: "reboot_blocked_reason", + Help: "Reboot required was blocked by event.", + }, []string{"node", "reason"}) ) const ( @@ -103,7 +108,7 @@ const ( ) func init() { - prometheus.MustRegister(rebootRequiredGauge) + prometheus.MustRegister(rebootRequiredGauge, rebootBlockedCounter) } func main() { @@ -547,7 +552,6 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Starting on go1.23, with Tick, we would have that goroutine garbage collected. c := time.Tick(period) for range c { - rebootRequired := checker.RebootRequired() if !rebootRequired { rebootRequiredGauge.WithLabelValues(nodeID).Set(0) @@ -629,8 +633,12 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } // moved up, because we should not put an annotation "Going to be rebooting", if // we know well that this won't reboot. TBD as some ppl might have another opinion. - if blocked, names := blockers.RebootBlocked(blockCheckers...); blocked { - log.Infof("Reboot blocked by %v", strings.Join(names, ", ")) + + if blocked, currentlyBlocking := blockers.RebootBlocked(blockCheckers...); blocked { + for _, blocker := range currentlyBlocking { + rebootBlockedCounter.WithLabelValues(nodeID, blocker.MetricLabel()).Inc() + log.Infof("Reboot blocked by %v", blocker.MetricLabel()) + } continue } diff --git a/pkg/blockers/blockers.go b/pkg/blockers/blockers.go index 5832c6b5c..a832c29e0 100644 --- a/pkg/blockers/blockers.go +++ b/pkg/blockers/blockers.go @@ -1,14 +1,14 @@ package blockers -import "fmt" - -// RebootBlocked checks that a single block Checker -// will block the reboot or not. -func RebootBlocked(blockers ...RebootBlocker) (blocked bool, blockernames []string) { +// RebootBlocked returns whether at least one blocker +// amongst all the blockers IS CURRENTLY blocking the reboot +// and also returns the subset of those blockers CURRENTLY BLOCKING +// the reboot. 
+func RebootBlocked(blockers ...RebootBlocker) (blocked bool, blocking []RebootBlocker) { for _, blocker := range blockers { if blocker.IsBlocked() { blocked = true - blockernames = append(blockernames, fmt.Sprintf("%T", blocker)) + blocking = append(blocking, blocker) } } return @@ -18,4 +18,5 @@ func RebootBlocked(blockers ...RebootBlocker) (blocked bool, blockernames []stri // to know if their instantiations should block a reboot type RebootBlocker interface { IsBlocked() bool + MetricLabel() string } diff --git a/pkg/blockers/blockers_test.go b/pkg/blockers/blockers_test.go index 8b60c2b51..3f55cf663 100644 --- a/pkg/blockers/blockers_test.go +++ b/pkg/blockers/blockers_test.go @@ -13,6 +13,10 @@ func (fbc BlockingChecker) IsBlocked() bool { return fbc.blocking } +func (fbc BlockingChecker) MetricLabel() string { + return "fake_blocker" +} + func Test_rebootBlocked(t *testing.T) { noCheckers := []RebootBlocker{} nonblockingChecker := BlockingChecker{blocking: false} diff --git a/pkg/blockers/kubernetespod.go b/pkg/blockers/kubernetespod.go index ebd034097..21f38c6ec 100644 --- a/pkg/blockers/kubernetespod.go +++ b/pkg/blockers/kubernetespod.go @@ -59,3 +59,7 @@ func (kb KubernetesBlockingChecker) IsBlocked() bool { } return false } + +func (kb KubernetesBlockingChecker) MetricLabel() string { + return "blocker_pod" +} diff --git a/pkg/blockers/prometheus.go b/pkg/blockers/prometheus.go index 05c1bf136..9c386f476 100644 --- a/pkg/blockers/prometheus.go +++ b/pkg/blockers/prometheus.go @@ -68,7 +68,7 @@ func (pb PrometheusBlockingChecker) IsBlocked() bool { // MetricLabel is used to give a fancier name // than the type to the label for rebootBlockedCounter func (pb PrometheusBlockingChecker) MetricLabel() string { - return "prometheus" + return "prometheus_alert" } // ActiveAlerts is a method of type promClient, it returns a list of names of active alerts From 8cad29a38b001a2098c1e6c0e796aa3ac2193072 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 8 Nov 2024 20:56:05 +0100 Subject: [PATCH 12/22] Expose unpriv commands Wthout this, it will not be possible to use unprivileged commands. This is a problem, as one might want to run a command without the nsenter to pid 1. This fixes it by exposing this to main, the only thing remaining is to use a boolean to expose the feature in a later commit. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 2 +- pkg/reboot/command.go | 9 +++++++-- pkg/reboot/command_test.go | 2 +- pkg/reboot/reboot.go | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 6fe50cfb7..8cb5f37d5 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -244,7 +244,7 @@ func main() { log.Infof("Reboot method: %s", rebootMethod) - rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal, rebootDelay) + rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal, rebootDelay, true, 1) if err != nil { log.Fatalf("Failed to build rebooter: %v", err) } diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 88b7d577d..3bde3bc01 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -37,15 +37,20 @@ func (c CommandRebooter) Reboot() error { // NewCommandRebooter is the constructor to create a CommandRebooter from a string not // yet shell lexed. You can skip this constructor if you parse the data correctly first // when instantiating a CommandRebooter instance. 
-func NewCommandRebooter(rebootCommand string, rebootDelay time.Duration) (*CommandRebooter, error) { +func NewCommandRebooter(rebootCommand string, rebootDelay time.Duration, privileged bool, pid int) (*CommandRebooter, error) { if rebootCommand == "" { return nil, fmt.Errorf("no reboot command specified") } - cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", 1), "--"} + cmd := []string{} + if privileged && pid > 0 { + cmd = append(cmd, "/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--") + } + parsedCommand, err := shlex.Split(rebootCommand) if err != nil { return nil, fmt.Errorf("error %v when parsing reboot command %s", err, rebootCommand) } + cmd = append(cmd, parsedCommand...) return &CommandRebooter{ RebootCommand: cmd, diff --git a/pkg/reboot/command_test.go b/pkg/reboot/command_test.go index f1bdc270b..bd9eb2b50 100644 --- a/pkg/reboot/command_test.go +++ b/pkg/reboot/command_test.go @@ -30,7 +30,7 @@ func TestNewCommandRebooter(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := NewCommandRebooter(tt.args.rebootCommand) + got, err := NewCommandRebooter(tt.args.rebootCommand, 0, true, 1) if (err != nil) != tt.wantErr { t.Errorf("NewCommandRebooter() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 663a3fe8a..355ce702d 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -16,11 +16,11 @@ type Rebooter interface { // NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, // then chains to the right constructor. -func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int, rebootDelay time.Duration) (Rebooter, error) { +func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int, rebootDelay time.Duration, privileged bool, pid int) (Rebooter, error) { switch { case rebootMethod == "command": logrus.Infof("Reboot command: %s", rebootCommand) - return NewCommandRebooter(rebootCommand, rebootDelay) + return NewCommandRebooter(rebootCommand, rebootDelay, true, 1) case rebootMethod == "signal": logrus.Infof("Reboot signal: %d", rebootSignal) return NewSignalRebooter(rebootSignal, rebootDelay) From f8feca26d1e7fe743b4f1fd6befd7bfcd082eee1 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 8 Nov 2024 20:59:20 +0100 Subject: [PATCH 13/22] Remove unnecessary functions delaytick is not used anymore. 
Signed-off-by: Jean-Philippe Evrard --- internal/delaytick/delaytick.go | 22 ---------------------- internal/validators.go | 1 - 2 files changed, 23 deletions(-) delete mode 100644 internal/delaytick/delaytick.go delete mode 100644 internal/validators.go diff --git a/internal/delaytick/delaytick.go b/internal/delaytick/delaytick.go deleted file mode 100644 index 880b64e81..000000000 --- a/internal/delaytick/delaytick.go +++ /dev/null @@ -1,22 +0,0 @@ -package delaytick - -import ( - "math/rand" - "time" -) - -// New ticks regularly after an initial delay randomly distributed between d/2 and d + d/2 -func New(s rand.Source, d time.Duration) <-chan time.Time { - c := make(chan time.Time) - - go func() { - random := rand.New(s) - time.Sleep(time.Duration(float64(d)/2 + float64(d)*random.Float64())) - c <- time.Now() - for t := range time.Tick(d) { - c <- t - } - }() - - return c -} diff --git a/internal/validators.go b/internal/validators.go deleted file mode 100644 index 5bf0569ce..000000000 --- a/internal/validators.go +++ /dev/null @@ -1 +0,0 @@ -package internal From 4b7ebd80d64cfce46116bd37cb5d2157357365d7 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 9 Nov 2024 19:30:37 +0100 Subject: [PATCH 14/22] Move to slog Without this, we are using logrus and the logs are inconsistent. This fixes it by moving to slog, and reviewing all the logs. I added multiple comments to clarify the intent behind the logs. With structured logging, extra data, such as the node, is exposed whenever possible, which makes the logging more consistent. We are not yet using contextes for ticks, it should be done in a later commit. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 231 +++++++++++++++--------- go.mod | 1 - go.sum | 2 - internal/daemonsetlock/daemonsetlock.go | 6 +- internal/taints/taints.go | 24 ++- pkg/blockers/kubernetespod.go | 6 +- pkg/blockers/prometheus.go | 6 +- pkg/checkers/checker.go | 9 +- pkg/checkers/commandchecker.go | 13 +- pkg/checkers/commandchecker_test.go | 10 +- pkg/reboot/command.go | 6 +- pkg/reboot/reboot.go | 8 +- pkg/reboot/signal.go | 4 +- 13 files changed, 199 insertions(+), 127 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 8cb5f37d5..079eab213 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -14,7 +14,6 @@ import ( papi "github.com/prometheus/client_golang/api" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" - log "github.com/sirupsen/logrus" flag "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -22,6 +21,8 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" kubectldrain "k8s.io/kubectl/pkg/drain" + "log" + "log/slog" "net/http" "net/url" "os" @@ -205,83 +206,100 @@ func main() { // Load flags from environment variables LoadFromEnv() - log.Infof("Kubernetes Reboot Daemon: %s", version) - - if logFormat == "json" { - log.SetFormatter(&log.JSONFormatter{}) + var logger *slog.Logger + switch logFormat { + case "json": + logger = slog.New(slog.NewJSONHandler(os.Stdout, nil)) + case "text": + logger = slog.New(slog.NewTextHandler(os.Stdout, nil)) + default: + logger = slog.New(slog.NewTextHandler(os.Stdout, nil)) + logger.Info("incorrect configuration for logFormat, using text handler") } + // For all the old calls using logger + slog.SetDefault(logger) if nodeID == "" { - log.Fatal("KURED_NODE_ID environment variable required") + slog.Error("KURED_NODE_ID environment variable required") + os.Exit(1) + 
} + + window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone) + if err != nil { + // TODO: Improve stacktrace with slog + slog.Error(fmt.Sprintf("Failed to build time window: %v", err)) + os.Exit(2) } - log.Infof("Node ID: %s", nodeID) notifyURL = validateNotificationURL(notifyURL, slackHookURL) - err := validateNodeLabels(preRebootNodeLabels, postRebootNodeLabels) + err = validateNodeLabels(preRebootNodeLabels, postRebootNodeLabels) if err != nil { - log.Warnf(err.Error()) + slog.Info(err.Error(), "node", nodeID) } - log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName) - - // This should be printed from blocker list instead of only blocking pod selectors - log.Infof("Blocking Pod Selectors: %v", podSelectors) + slog.Info("Starting Kubernetes Reboot Daemon", + "version", version, + "node", nodeID, + "rebootPeriod", period, + "concurrency", concurrency, + "schedule", window, + "method", rebootMethod, + ) - log.Infof("Reboot period %v", period) - log.Infof("Concurrency: %v", concurrency) + slog.Info(fmt.Sprintf("preferNoSchedule taint: (%s)", preferNoScheduleTaintName), "node", nodeID) if annotateNodes { - log.Infof("Will annotate nodes during kured reboot operations") + slog.Info("Will annotate nodes during kured reboot operations", "node", nodeID) } - // Now call the rest of the main loop. - window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone) - if err != nil { - log.Fatalf("Failed to build time window: %v", err) - } - log.Infof("Reboot schedule: %v", window) - - log.Infof("Reboot method: %s", rebootMethod) - rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal, rebootDelay, true, 1) if err != nil { - log.Fatalf("Failed to build rebooter: %v", err) + slog.Error(fmt.Sprintf("unrecoverable error - failed to construct system rebooter: %v", err)) + os.Exit(3) } rebootChecker, err := checkers.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile) if err != nil { - log.Fatalf("Failed to build reboot checker: %v", err) + slog.Error(fmt.Sprintf("unrecoverable error - failed to build reboot checker: %v", err)) + os.Exit(4) } config, err := rest.InClusterConfig() if err != nil { - log.Fatal(err) + slog.Error(fmt.Sprintf("unrecoverable error - failed to load in cluster kubernetes config: %v", err)) + os.Exit(5) } client, err := kubernetes.NewForConfig(config) if err != nil { - log.Fatal(err) + slog.Error(fmt.Sprintf("unrecoverable error - failed to load in cluster kubernetes config: %v", err)) + os.Exit(5) } var blockCheckers []blockers.RebootBlocker if prometheusURL != "" { + slog.Info(fmt.Sprintf("Blocking reboot with prometheus alerts on %v", prometheusURL), "node", nodeID) blockCheckers = append(blockCheckers, blockers.NewPrometheusBlockingChecker(papi.Config{Address: prometheusURL}, alertFilter.Regexp, alertFiringOnly, alertFilterMatchOnly)) } if podSelectors != nil { + slog.Info(fmt.Sprintf("Blocking Pod Selectors: %v", podSelectors), "node", nodeID) blockCheckers = append(blockCheckers, blockers.NewKubernetesBlockingChecker(client, nodeID, podSelectors)) } - log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) + + slog.Info(fmt.Sprintf("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation), "node", nodeID) if lockTTL > 0 { - log.Infof("Lock TTL set, lock will expire after: %v", lockTTL) + slog.Info(fmt.Sprintf("Lock TTL set, lock will expire after: %v", lockTTL), "node", nodeID) } else { - log.Info("Lock TTL not set, lock will remain until being released") + slog.Info("Lock 
TTL not set, lock will remain until being released", "node", nodeID) } + if lockReleaseDelay > 0 { - log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay) + slog.Info(fmt.Sprintf("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay), "node", nodeID) } else { - log.Info("Lock release delay not set, lock will be released immediately after rebooting") + slog.Info("Lock release delay not set, lock will be released immediately after rebooting", "node", nodeID) } + lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client, period) @@ -307,22 +325,23 @@ func validateNodeLabels(preRebootNodeLabels []string, postRebootNodeLabels []str return nil } +// Remove old flags func validateNotificationURL(notifyURL string, slackHookURL string) string { switch { case slackHookURL != "" && notifyURL != "": - log.Warnf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL) + slog.Error(fmt.Sprintf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL)) return validateNotificationURL(notifyURL, "") case notifyURL != "": return stripQuotes(notifyURL) case slackHookURL != "": - log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.") + slog.Info("Deprecated flag(s). Please use --notify-url flag instead.") parsedURL, err := url.Parse(stripQuotes(slackHookURL)) if err != nil { - log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) + slog.Info(fmt.Sprintf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)) return "" } if len(strings.Split(strings.Replace(parsedURL.Path, "/services/", "", -1), "/")) != 3 { - log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") + slog.Error(fmt.Sprintf("slack-hook-url is not properly formatted... 
no notification will be sent: unexpected number of / in URL\n")) return "" } return fmt.Sprintf("slack://%s", strings.Replace(parsedURL.Path, "/services/", "", -1)) @@ -405,26 +424,48 @@ func stripQuotes(str string) string { return str } +type slogWriter struct { + stream string + message string +} + +func (sw slogWriter) Write(p []byte) (n int, err error) { + output := string(p) + switch sw.stream { + case "stdout": + slog.Info(sw.message, "node", nodeID, "stdout", output) + case "stderr": + slog.Info(sw.message, "node", nodeID, "stderr", output) + } + return len(p), nil +} + func drain(client *kubernetes.Clientset, node *v1.Node) error { nodename := node.GetName() if preRebootNodeLabels != nil { - updateNodeLabels(client, node, preRebootNodeLabels) + err := updateNodeLabels(client, node, preRebootNodeLabels) + if err != nil { + return fmt.Errorf("stopping drain due to problem with node labels %v", err) + } } if drainDelay > 0 { - log.Infof("Delaying drain for %v", drainDelay) + slog.Debug("Delaying drain", "delay", drainDelay, "node", nodename) time.Sleep(drainDelay) } - log.Infof("Draining node %s", nodename) + slog.Info("Starting drain", "node", nodename) if notifyURL != "" { if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateDrain, nodename)); err != nil { - log.Warnf("Error notifying: %v", err) + slog.Info(fmt.Sprintf("Error notifying but continuing drain: %v", err), "node", nodename) } } + kubectlStdOutLogger := &slogWriter{message: "draining: results", stream: "stdout"} + kubectlStdErrLogger := &slogWriter{message: "draining: results", stream: "stderr"} + drainer := &kubectldrain.Helper{ Client: client, Ctx: context.Background(), @@ -434,37 +475,37 @@ func drain(client *kubernetes.Clientset, node *v1.Node) error { Force: true, DeleteEmptyDirData: true, IgnoreAllDaemonSets: true, - ErrOut: os.Stderr, - Out: os.Stdout, + ErrOut: kubectlStdErrLogger, + Out: kubectlStdOutLogger, Timeout: drainTimeout, } if err := kubectldrain.RunCordonOrUncordon(drainer, node, true); err != nil { - log.Errorf("Error cordonning %s: %v", nodename, err) - return err + return fmt.Errorf("error cordonning node %s, %v", nodename, err) } if err := kubectldrain.RunNodeDrain(drainer, nodename); err != nil { - log.Errorf("Error draining %s: %v", nodename, err) - return err + return fmt.Errorf("error draining node %s: %v", nodename, err) } return nil } func uncordon(client *kubernetes.Clientset, node *v1.Node) error { nodename := node.GetName() - log.Infof("Uncordoning node %s", nodename) + kubectlStdOutLogger := &slogWriter{message: "uncordon: results", stream: "stdout"} + kubectlStdErrLogger := &slogWriter{message: "uncordon: results", stream: "stderr"} + drainer := &kubectldrain.Helper{ Client: client, - ErrOut: os.Stderr, - Out: os.Stdout, + ErrOut: kubectlStdErrLogger, + Out: kubectlStdOutLogger, Ctx: context.Background(), } if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil { - log.Fatalf("Error uncordonning %s: %v", nodename, err) - return err + return fmt.Errorf("error uncordonning node %s: %v", nodename, err) } else if postRebootNodeLabels != nil { - updateNodeLabels(client, node, postRebootNodeLabels) + err := updateNodeLabels(client, node, postRebootNodeLabels) + return fmt.Errorf("error updating node (%s) labels, needs manual intervention %v", nodename, err) } return nil } @@ -472,18 +513,16 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error { func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error 
{ node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { - log.Errorf("Error retrieving node object via k8s API: %s", err) - return err + return fmt.Errorf("error retrieving node object via k8s API: %v", err) } for k, v := range annotations { node.Annotations[k] = v - log.Infof("Adding node %s annotation: %s=%s", node.GetName(), k, v) + slog.Debug(fmt.Sprintf("adding node annotation: %s=%s", k, v), "node", node.GetName()) } bytes, err := json.Marshal(node) if err != nil { - log.Errorf("Error marshalling node object into JSON: %v", err) - return err + return fmt.Errorf("error marshalling node object into JSON: %v", err) } _, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{}) @@ -492,34 +531,30 @@ func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations for k, v := range annotations { annotationsErr += fmt.Sprintf("%s=%s ", k, v) } - log.Errorf("Error adding node annotations %s via k8s API: %v", annotationsErr, err) - return err + return fmt.Errorf("error adding node annotations %s via k8s API: %v", annotationsErr, err) } return nil } func deleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) error { - log.Infof("Deleting node %s annotation %s", nodeID, key) - // JSON Patch takes as path input a JSON Pointer, defined in RFC6901 // So we replace all instances of "/" with "~1" as per: // https://tools.ietf.org/html/rfc6901#section-3 patch := []byte(fmt.Sprintf("[{\"op\":\"remove\",\"path\":\"/metadata/annotations/%s\"}]", strings.ReplaceAll(key, "/", "~1"))) _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{}) if err != nil { - log.Errorf("Error deleting node annotation %s via k8s API: %v", key, err) - return err + return fmt.Errorf("error deleting node annotation %s via k8s API: %v", key, err) } return nil } -func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []string) { +func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []string) error { labelsMap := make(map[string]string) for _, label := range labels { k := strings.Split(label, "=")[0] v := strings.Split(label, "=")[1] labelsMap[k] = v - log.Infof("Updating node %s label: %s=%s", node.GetName(), k, v) + slog.Debug(fmt.Sprintf("Updating node %s label: %s=%s", node.GetName(), k, v), "node", node.GetName()) } bytes, err := json.Marshal(map[string]interface{}{ @@ -528,7 +563,7 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri }, }) if err != nil { - log.Fatalf("Error marshalling node object into JSON: %v", err) + return fmt.Errorf("error marshalling node object into JSON: %v", err) } _, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{}) @@ -539,8 +574,9 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri v := strings.Split(label, "=")[1] labelsErr += fmt.Sprintf("%s=%s ", k, v) } - log.Errorf("Error updating node labels %s via k8s API: %v", labelsErr, err) + return fmt.Errorf("error updating node labels %s via k8s API: %v", labelsErr, err) } + return nil } func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset, period time.Duration) { @@ -564,7 +600,9 @@ func 
rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Test API server first. If cannot get node, we should not do anything. node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { - log.Infof("Error retrieving node object via k8s API: %v", err) + // Only debug level even though the API is dead: Kured should not worry about transient + // issues, the k8s cluster admin should be aware already + slog.Debug(fmt.Sprintf("error retrieving node object via k8s API: %v.\nPlease check API", err), "node", nodeID, "error", err) continue } @@ -573,7 +611,8 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // TODO: Need to move to another method to check the current data of the lock relevant for this node holding, lockData, err := lock.Holding() if err != nil { - log.Infof("Error checking lock - Not applying any action: %v", err) + // Internal wiring, admin does not need to be aware unless debugging session + slog.Debug(fmt.Sprintf("Error checking lock - Not applying any action: %v.\nPlease check API", err), "node", nodeID, "error", err) continue } @@ -590,16 +629,20 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Split into two lines to remember I need to remove the first // condition ;) if node.Spec.Unschedulable != lockData.Metadata.Unschedulable && lockData.Metadata.Unschedulable == false { + // Important lifecycle event - admin should be aware + slog.Info("uncordoning node", "node", nodeID) err = uncordon(client, node) if err != nil { - log.Infof("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err) + // Transient API issue - no problem, will retry. No need to worry the admin for this + slog.Debug(fmt.Sprintf("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err), "node", nodeID, "error", err) continue } // TODO, modify the actions to directly log and notify, instead of individual methods giving // an incomplete view of the lifecycle if notifyURL != "" { if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateUncordon, nodeID)); err != nil { - log.Warnf("Error notifying: %v", err) + // admin might want to act upon this -- raising to info level + slog.Info(fmt.Sprintf("Error notifying: %v", err), "node", nodeID, "error", err) } } } @@ -609,13 +652,16 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Releasing lock earlier is nice for other nodes err = lock.Release() if err != nil { - log.Infof("Error releasing lock, will retry: %v", err) + // Lock release is an internal thing, do not worry the admin too much + slog.Debug("Error releasing lock, will retry", "node", nodeID, "error", err) continue } // Regardless or not we are holding the lock // The node should not say it's still in progress if the reboot is done if annotateNodes { if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok { + // Who reads this? I hope nobody bothers outside real debug cases + slog.Debug(fmt.Sprintf("Deleting node %s annotation %s", nodeID, KuredRebootInProgressAnnotation), "node", nodeID) err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) if err != nil { continue @@ -628,7 +674,8 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Act on reboot required. 
if !window.Contains(time.Now()) { - log.Debugf("Reboot required for node %v, but outside maintenance window", nodeID) + // Probably spamming outside the maintenance window. This should not be logged as info + slog.Debug("reboot required but outside maintenance window", "node", nodeID) continue } // moved up, because we should not put an annotation "Going to be rebooting", if @@ -637,7 +684,8 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. if blocked, currentlyBlocking := blockers.RebootBlocked(blockCheckers...); blocked { for _, blocker := range currentlyBlocking { rebootBlockedCounter.WithLabelValues(nodeID, blocker.MetricLabel()).Inc() - log.Infof("Reboot blocked by %v", blocker.MetricLabel()) + // Important lifecycle event -- tried to reboot, but was blocked! + slog.Info(fmt.Sprintf("reboot blocked by %v", blocker.MetricLabel()), "node", nodeID) } continue } @@ -645,7 +693,8 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Test API server first. If cannot get node, we should not do anything. node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { - log.Debugf("Error retrieving node object via k8s API: %v", err) + // Not important to worry the admin + slog.Debug("error retrieving node object via k8s API, retrying at next tick", "node", nodeID, "error", err) continue } // nodeMeta contains the node Metadata that should be included in the lock @@ -673,18 +722,21 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // This could be merged into a single idempotent "Acquire" lock holding, _, err := lock.Holding() if err != nil { - log.Debugf("Error testing lock: %v", err) + // Not important to worry the admin + slog.Debug("error testing lock", "node", nodeID, "error", err) continue } if !holding { acquired, holder, err := lock.Acquire(nodeMeta) if err != nil { - log.Debugf("Error acquiring lock: %v", err) + // Not important to worry the admin + slog.Debug("error acquiring lock, will retry at next tick", "node", nodeID, "error", err) continue } if !acquired { - log.Infof("Lock already held by %v, will retry", holder) + // Not very important - lock prevents action + slog.Debug(fmt.Sprintf("Lock already held by %v, will retry at next tick", holder), "node", nodeID) continue } } @@ -692,27 +744,32 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. err = drain(client, node) if err != nil { if !forceReboot { - log.Infof("Unable to cordon or drain %s: %v, will force-reboot by releasing lock and uncordon until next success", node.GetName(), err) + slog.Debug(fmt.Sprintf("Unable to cordon or drain %s: %v, will force-reboot by releasing lock and uncordon until next success", node.GetName(), err), "node", nodeID, "error", err) err = lock.Release() if err != nil { - log.Infof("Error in best-effort releasing lock: %v", err) + slog.Debug(fmt.Sprintf("error in best-effort releasing lock: %v", err), "node", nodeID, "error", err) } - log.Infof("Performing a best-effort uncordon after failed cordon and drain") + // this is important -- if it's not shown, it means that (on normal / non-force reboot case) the drain was + // in error, the lock was NOT released. + // If shown, it is helping understand the uncordonning. If the admin seems the node as cordonned + // with this, it needs to take action (for example if the node was previously cordonned!) 
+ slog.Info("Performing a best-effort uncordon after failed cordon and drain", "node", nodeID) uncordon(client, node) continue } } if notifyURL != "" { if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil { - log.Infof("Error notifying: %v", err) + // most likely important to see: if not notified, at least it would be logged! + slog.Info("Error sending notification for reboot", "node", nodeID, "error", err) } } - - log.Infof("Triggering reboot for node %v", nodeID) + // important lifecycle event + slog.Info(fmt.Sprintf("Triggering reboot for node %v", nodeID), "node", nodeID) rebooter.Reboot() for { - log.Infof("Waiting for reboot") + slog.Info("Waiting for reboot", "node", nodeID) time.Sleep(time.Minute) } } diff --git a/go.mod b/go.mod index 86892995d..990db913a 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 github.com/prometheus/client_golang v1.20.5 github.com/prometheus/common v0.60.1 - github.com/sirupsen/logrus v1.9.3 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.9.0 k8s.io/api v0.29.9 diff --git a/go.sum b/go.sum index 841172484..2c3fea46a 100644 --- a/go.sum +++ b/go.sum @@ -170,8 +170,6 @@ github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= diff --git a/internal/daemonsetlock/daemonsetlock.go b/internal/daemonsetlock/daemonsetlock.go index ba300f9de..54a698613 100644 --- a/internal/daemonsetlock/daemonsetlock.go +++ b/internal/daemonsetlock/daemonsetlock.go @@ -4,7 +4,7 @@ import ( "context" "encoding/json" "fmt" - log "github.com/sirupsen/logrus" + "log/slog" "strings" "time" @@ -197,7 +197,7 @@ func (dsl *DaemonSetSingleLock) Holding() (bool, LockAnnotationValue, error) { // Release attempts to remove the lock data from the kured ds annotations using client-go func (dsl *DaemonSetSingleLock) Release() error { if dsl.releaseDelay > 0 { - log.Infof("Waiting %v before releasing lock", dsl.releaseDelay) + slog.Info(fmt.Sprintf("Waiting %v before releasing lock", dsl.releaseDelay)) time.Sleep(dsl.releaseDelay) } for { @@ -357,7 +357,7 @@ func (dsl *DaemonSetMultiLock) Holding() (bool, LockAnnotationValue, error) { // Release attempts to remove the lock data for a single node from the multi node annotation func (dsl *DaemonSetMultiLock) Release() error { if dsl.releaseDelay > 0 { - log.Infof("Waiting %v before releasing lock", dsl.releaseDelay) + slog.Info(fmt.Sprintf("Waiting %v before releasing lock", dsl.releaseDelay)) time.Sleep(dsl.releaseDelay) } for { diff --git a/internal/taints/taints.go b/internal/taints/taints.go index bd59e8d6a..1d92cc56f 100644 --- a/internal/taints/taints.go +++ b/internal/taints/taints.go @@ -4,12 +4,12 @@ import ( "context" "encoding/json" "fmt" - - log "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" + "log/slog" + "os" ) // Taint allows to set soft and hard limitations for scheduling and executing pods on nodes. @@ -67,7 +67,9 @@ func (t *Taint) Disable() { func taintExists(client *kubernetes.Clientset, nodeID, taintName string) (bool, int, *v1.Node) { updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil || updatedNode == nil { - log.Fatalf("Error reading node %s: %v", nodeID, err) + slog.Debug("Error reading node from API server", "node", nodeID, "error", err) + // TODO: Clarify with community if we need to exit + os.Exit(10) } for i, taint := range updatedNode.Spec.Taints { @@ -83,12 +85,12 @@ func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, ef taintExists, offset, updatedNode := taintExists(client, nodeID, taintName) if taintExists && shouldExists { - log.Debugf("Taint %v exists already for node %v.", taintName, nodeID) + slog.Debug(fmt.Sprintf("Taint %v exists already for node %v.", taintName, nodeID), "node", nodeID) return } if !taintExists && !shouldExists { - log.Debugf("Taint %v already missing for node %v.", taintName, nodeID) + slog.Debug(fmt.Sprintf("Taint %v already missing for node %v.", taintName, nodeID), "node", nodeID) return } @@ -150,17 +152,21 @@ func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, ef patchBytes, err := json.Marshal(patches) if err != nil { - log.Fatalf("Error encoding taint patch for node %s: %v", nodeID, err) + slog.Debug(fmt.Sprintf("Error encoding taint patch for node %s: %v", nodeID, err), "node", nodeID, "error", err) + // TODO: Clarify if we really need to exit with the community + os.Exit(10) } _, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{}) if err != nil { - log.Fatalf("Error patching taint for node %s: %v", nodeID, err) + // TODO: Clarify if we really need to exit with the community + slog.Debug(fmt.Sprintf("Error patching taint for node %s: %v", nodeID, err), "node", nodeID, "error", err) + os.Exit(10) } if shouldExists { - log.Info("Node taint added") + slog.Info("Node taint added", "node", nodeID) } else { - log.Info("Node taint removed") + slog.Info("Node taint removed", "node", nodeID) } } diff --git a/pkg/blockers/kubernetespod.go b/pkg/blockers/kubernetespod.go index 21f38c6ec..85f369733 100644 --- a/pkg/blockers/kubernetespod.go +++ b/pkg/blockers/kubernetespod.go @@ -3,9 +3,9 @@ package blockers import ( "context" "fmt" - log "github.com/sirupsen/logrus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" + "log/slog" ) // Compile-time checks to ensure the type implements the interface @@ -41,7 +41,7 @@ func (kb KubernetesBlockingChecker) IsBlocked() bool { FieldSelector: fieldSelector, Limit: 10}) if err != nil { - log.Warnf("Reboot blocked: pod query error: %v", err) + slog.Info("Reboot blocked: pod query error", "error", err) return true } @@ -53,7 +53,7 @@ func (kb KubernetesBlockingChecker) IsBlocked() bool { if len(podList.Continue) > 0 { podNames = append(podNames, "...") } - log.Warnf("Reboot blocked: matching pods: %v", podNames) + slog.Info(fmt.Sprintf("Reboot blocked due pods matching the patterns: %v", podNames)) return true } } diff --git a/pkg/blockers/prometheus.go b/pkg/blockers/prometheus.go index 9c386f476..d0ba62c3d 100644 --- a/pkg/blockers/prometheus.go +++ b/pkg/blockers/prometheus.go @@ -6,7 +6,7 @@ import ( 
papi "github.com/prometheus/client_golang/api" v1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" - log "github.com/sirupsen/logrus" + "log/slog" "regexp" "sort" "time" @@ -51,7 +51,7 @@ func NewPrometheusBlockingChecker(config papi.Config, alertFilter *regexp.Regexp func (pb PrometheusBlockingChecker) IsBlocked() bool { alertNames, err := pb.ActiveAlerts() if err != nil { - log.Warnf("Reboot blocked: prometheus query error: %v", err) + slog.Info("Reboot blocked: prometheus query error", "error", err) return true } count := len(alertNames) @@ -59,7 +59,7 @@ func (pb PrometheusBlockingChecker) IsBlocked() bool { alertNames = append(alertNames[:10], "...") } if count > 0 { - log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames) + slog.Info(fmt.Sprintf("Reboot blocked: %d active alerts: %v", count, alertNames)) return true } return false diff --git a/pkg/checkers/checker.go b/pkg/checkers/checker.go index 1693184e2..65e46ab82 100644 --- a/pkg/checkers/checker.go +++ b/pkg/checkers/checker.go @@ -1,6 +1,9 @@ package checkers -import "github.com/sirupsen/logrus" +import ( + "fmt" + "log/slog" +) // Checker is the standard interface to use to check // if a reboot is required. Its types must implement a @@ -15,9 +18,9 @@ type Checker interface { func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (Checker, error) { // An override of rebootSentinelCommand means a privileged command if rebootSentinelCommand != "" { - logrus.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) + slog.Info(fmt.Sprintf("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand)) return NewCommandChecker(rebootSentinelCommand, 1, true) } - logrus.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) + slog.Info("Sentinel checker is (unprivileged) testing for the presence of the file: " + rebootSentinelFile) return NewFileRebootChecker(rebootSentinelFile) } diff --git a/pkg/checkers/commandchecker.go b/pkg/checkers/commandchecker.go index 76185a0f6..5e0a15069 100644 --- a/pkg/checkers/commandchecker.go +++ b/pkg/checkers/commandchecker.go @@ -4,7 +4,8 @@ import ( "bytes" "fmt" "github.com/google/shlex" - log "github.com/sirupsen/logrus" + "log/slog" + "os" "os/exec" "strings" ) @@ -19,6 +20,8 @@ type CommandChecker struct { Privileged bool } +var exitFunc = os.Exit + // RebootRequired for CommandChecker runs a command without returning // any eventual error. This should be later refactored to return the errors, // instead of logging and fataling them here. @@ -38,15 +41,17 @@ func (rc CommandChecker) RebootRequired() bool { // is the right thing to do, and we are logging stdout/stderr of the command // so it should be obvious what is wrong. if cmd.ProcessState.ExitCode() != 1 { - log.Warn(fmt.Sprintf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + slog.Info(fmt.Sprintf("sentinel command ended with unexpected exit code: %v, preventing reboot", cmd.ProcessState.ExitCode()), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) } return false default: // Something was grossly misconfigured, such as the command path being wrong. 
- log.Fatal(fmt.Sprintf("Error invoking sentinel command: %v", err), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + slog.Error(fmt.Sprintf("Error invoking sentinel command: %v", err), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + // exitFunc is in indirection to help testing + exitFunc(11) } } - log.Info("checking if reboot is required", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + slog.Debug("reboot is required", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) return true } diff --git a/pkg/checkers/commandchecker_test.go b/pkg/checkers/commandchecker_test.go index 481cca153..066c6dedc 100644 --- a/pkg/checkers/commandchecker_test.go +++ b/pkg/checkers/commandchecker_test.go @@ -1,7 +1,6 @@ package checkers import ( - log "github.com/sirupsen/logrus" "reflect" "testing" ) @@ -70,9 +69,14 @@ func Test_rebootRequired(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - defer func() { log.StandardLogger().ExitFunc = nil }() fatal := false - log.StandardLogger().ExitFunc = func(int) { fatal = true } + + exitFunc = func(code int) { + fatal = true + } + + //// Reset exitFunc after the test + //defer func() { exitFunc = os.Exit } a := CommandChecker{CheckCommand: tt.args.sentinelCommand, NamespacePid: 1, Privileged: false} diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 3bde3bc01..8e2ba86d9 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -4,7 +4,7 @@ import ( "bytes" "fmt" "github.com/google/shlex" - log "github.com/sirupsen/logrus" + "log/slog" "os/exec" "strings" "time" @@ -19,7 +19,7 @@ type CommandRebooter struct { // Reboot triggers the reboot command func (c CommandRebooter) Reboot() error { c.DelayReboot() - log.Infof("Invoking command: %s", c.RebootCommand) + slog.Info("Invoking reboot command", "cmd", c.RebootCommand) bufStdout := new(bytes.Buffer) bufStderr := new(bytes.Buffer) @@ -30,7 +30,7 @@ func (c CommandRebooter) Reboot() error { if err := cmd.Run(); err != nil { return fmt.Errorf("error invoking reboot command %s: %v (stdout: %v, stderr: %v)", c.RebootCommand, err, bufStdout.String(), bufStderr.String()) } - log.Info("Invoked reboot command", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) + slog.Info("Invoked reboot command", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String()) return nil } diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 355ce702d..958088d8c 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -2,7 +2,7 @@ package reboot import ( "fmt" - "github.com/sirupsen/logrus" + "log/slog" "time" ) @@ -19,10 +19,10 @@ type Rebooter interface { func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int, rebootDelay time.Duration, privileged bool, pid int) (Rebooter, error) { switch { case rebootMethod == "command": - logrus.Infof("Reboot command: %s", rebootCommand) + slog.Info("Will reboot using command", "cmd", rebootCommand) return NewCommandRebooter(rebootCommand, rebootDelay, true, 1) case rebootMethod == "signal": - logrus.Infof("Reboot signal: %d", rebootSignal) + slog.Info("Will reboot using signal", "signal", rebootSignal) return NewSignalRebooter(rebootSignal, rebootDelay) default: return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or 
command", rebootMethod) @@ -35,7 +35,7 @@ type GenericRebooter struct { func (g GenericRebooter) DelayReboot() { if g.RebootDelay > 0 { - logrus.Infof("Delayed reboot for %s", g.RebootDelay) + slog.Debug(fmt.Sprintf("Delayed reboot for %s", g.RebootDelay)) time.Sleep(g.RebootDelay) } } diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index 75d357497..d1754732b 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -2,7 +2,7 @@ package reboot import ( "fmt" - log "github.com/sirupsen/logrus" + "log/slog" "os" "syscall" "time" @@ -17,7 +17,7 @@ type SignalRebooter struct { // Reboot triggers the reboot signal func (c SignalRebooter) Reboot() error { c.DelayReboot() - log.Infof("Invoking signal: %v", c.Signal) + slog.Info("Invoking reboot signal", "signal", c.Signal) process, err := os.FindProcess(1) if err != nil { From 6cdbc160bdc7635e25a91093cfaa064904c311bd Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sun, 10 Nov 2024 09:31:00 +0100 Subject: [PATCH 15/22] Fix typo in signal name signal is named SIGRTMIN not SIGTRMIN. It can be confusing for people. As this could also lead to questions, the code clarifies a TODO item: We are still relying on hardcoded ints, instead of evaluating the signal number at run time, as recommended by man (7) signal. Please read man (7) signal for further details. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 079eab213..868cd96b4 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -104,8 +104,8 @@ const ( KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" // EnvPrefix The environment variable prefix of all environment variables bound to our command line flags. EnvPrefix = "KURED" - - sigTrminPlus5 = 34 + 5 + // TODO: Replace this with runtime evaluation + sigRTMinPlus5 = 34 + 5 ) func init() { @@ -166,7 +166,7 @@ func main() { "command to run when a reboot is required") flag.IntVar(&concurrency, "concurrency", 1, "amount of nodes to concurrently reboot. Defaults to 1") - flag.IntVar(&rebootSignal, "reboot-signal", sigTrminPlus5, + flag.IntVar(&rebootSignal, "reboot-signal", sigRTMinPlus5, "signal to use for reboot, SIGRTMIN+5 by default.") flag.StringVar(&slackHookURL, "slack-hook-url", "", "slack hook URL for reboot notifications [deprecated in favor of --notify-url]") From 88658471b2a615ea12469273d953c45448a17432 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 11 Nov 2024 08:11:23 +0100 Subject: [PATCH 16/22] Move notifications to its own package The current code repeats itself and relies on global vars. This is a problem, as changing the notifications will move code all over, and actually prevents us to use a simple Send() method for all our notifications. 
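As a rough sketch of where this ends up (the two calls below are taken from
the main.go changes further down; the Notifier type itself lives in the new
internal/notifications package and its exact shape may differ):

    // Built once at startup from --notify-url or the deprecated
    // --slack-hook-url, then passed explicitly instead of global state.
    notifier := notifications.NewNotifier(notifyURL, slackHookURL)

    // Every lifecycle event goes through the same call, regardless of the
    // configured backend.
    notifier.Send(fmt.Sprintf(messageTemplateDrain, nodename), "Starting drain")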
This opens the door to a simple sender Sending events to multiple places (webhooks, logs, and a notification service) Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 74 +++---------- cmd/kured/main_test.go | 75 ------------- internal/notifications/notifications.go | 93 ++++++++++++++++ internal/notifications/notifications_test.go | 106 +++++++++++++++++++ 4 files changed, 213 insertions(+), 135 deletions(-) create mode 100644 internal/notifications/notifications.go create mode 100644 internal/notifications/notifications_test.go diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 868cd96b4..3c3285844 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,8 +4,8 @@ import ( "context" "encoding/json" "fmt" - "github.com/containrrr/shoutrrr" "github.com/kubereboot/kured/internal/daemonsetlock" + "github.com/kubereboot/kured/internal/notifications" "github.com/kubereboot/kured/internal/taints" "github.com/kubereboot/kured/internal/timewindow" "github.com/kubereboot/kured/pkg/blockers" @@ -24,7 +24,6 @@ import ( "log" "log/slog" "net/http" - "net/url" "os" "reflect" "sort" @@ -231,7 +230,7 @@ func main() { os.Exit(2) } - notifyURL = validateNotificationURL(notifyURL, slackHookURL) + notifier := notifications.NewNotifier(notifyURL, slackHookURL) err = validateNodeLabels(preRebootNodeLabels, postRebootNodeLabels) if err != nil { @@ -302,7 +301,7 @@ func main() { lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) - go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client, period) + go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client, period, notifier) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) @@ -325,30 +324,6 @@ func validateNodeLabels(preRebootNodeLabels []string, postRebootNodeLabels []str return nil } -// Remove old flags -func validateNotificationURL(notifyURL string, slackHookURL string) string { - switch { - case slackHookURL != "" && notifyURL != "": - slog.Error(fmt.Sprintf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL)) - return validateNotificationURL(notifyURL, "") - case notifyURL != "": - return stripQuotes(notifyURL) - case slackHookURL != "": - slog.Info("Deprecated flag(s). Please use --notify-url flag instead.") - parsedURL, err := url.Parse(stripQuotes(slackHookURL)) - if err != nil { - slog.Info(fmt.Sprintf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)) - return "" - } - if len(strings.Split(strings.Replace(parsedURL.Path, "/services/", "", -1), "/")) != 3 { - slog.Error(fmt.Sprintf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n")) - return "" - } - return fmt.Sprintf("slack://%s", strings.Replace(parsedURL.Path, "/services/", "", -1)) - } - return "" -} - // LoadFromEnv attempts to load environment variables corresponding to flags. // It looks for an environment variable with the uppercase version of the flag name (prefixed by EnvPrefix). 
func LoadFromEnv() { @@ -411,19 +386,6 @@ func LoadFromEnv() { } -// stripQuotes removes any literal single or double quote chars that surround a string -func stripQuotes(str string) string { - if len(str) > 2 { - firstChar := str[0] - lastChar := str[len(str)-1] - if firstChar == lastChar && (firstChar == '"' || firstChar == '\'') { - return str[1 : len(str)-1] - } - } - // return the original string if it has a length of zero or one - return str -} - type slogWriter struct { stream string message string @@ -440,7 +402,7 @@ func (sw slogWriter) Write(p []byte) (n int, err error) { return len(p), nil } -func drain(client *kubernetes.Clientset, node *v1.Node) error { +func drain(client *kubernetes.Clientset, node *v1.Node, notifier notifications.Notifier) error { nodename := node.GetName() if preRebootNodeLabels != nil { @@ -457,11 +419,7 @@ func drain(client *kubernetes.Clientset, node *v1.Node) error { slog.Info("Starting drain", "node", nodename) - if notifyURL != "" { - if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateDrain, nodename)); err != nil { - slog.Info(fmt.Sprintf("Error notifying but continuing drain: %v", err), "node", nodename) - } - } + notifier.Send(fmt.Sprintf(messageTemplateDrain, nodename), "Starting drain") kubectlStdOutLogger := &slogWriter{message: "draining: results", stream: "stdout"} kubectlStdErrLogger := &slogWriter{message: "draining: results", stream: "stderr"} @@ -579,7 +537,7 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri return nil } -func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset, period time.Duration) { +func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset, period time.Duration, notifier notifications.Notifier) { preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) @@ -639,12 +597,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } // TODO, modify the actions to directly log and notify, instead of individual methods giving // an incomplete view of the lifecycle - if notifyURL != "" { - if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateUncordon, nodeID)); err != nil { - // admin might want to act upon this -- raising to info level - slog.Info(fmt.Sprintf("Error notifying: %v", err), "node", nodeID, "error", err) - } - } + notifier.Send(fmt.Sprintf(messageTemplateUncordon, nodeID), "Node uncordonned") } } @@ -741,7 +694,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } } - err = drain(client, node) + err = drain(client, node, notifier) if err != nil { if !forceReboot { slog.Debug(fmt.Sprintf("Unable to cordon or drain %s: %v, will force-reboot by releasing lock and uncordon until next success", node.GetName(), err), "node", nodeID, "error", err) @@ -758,12 +711,13 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. continue } } - if notifyURL != "" { - if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil { - // most likely important to see: if not notified, at least it would be logged! 
- slog.Info("Error sending notification for reboot", "node", nodeID, "error", err) - } + notifier.Send(fmt.Sprintf(messageTemplateReboot, nodeID), "Node Reboot") + if err != nil { + // If notifications are not sent, lifecycle event of the reboot is still recorded, so it's no + // big deal, as long as it can be debugged. + slog.Debug("Error sending notification for reboot", "node", nodeID, "error", err) } + // important lifecycle event slog.Info(fmt.Sprintf("Triggering reboot for node %v", nodeID), "node", nodeID) diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index 4687d0e77..06ab7d0f9 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -1,76 +1 @@ package main - -import ( - "reflect" - "testing" -) - -func TestValidateNotificationURL(t *testing.T) { - - tests := []struct { - name string - slackHookURL string - notifyURL string - expected string - }{ - {"slackHookURL only works fine", "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"}, - {"slackHookURL and notify URL together only keeps notifyURL", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"}, - {"slackHookURL removes extraneous double quotes", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"}, - {"slackHookURL removes extraneous single quotes", "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"}, - {"notifyURL removes extraneous double quotes", "", "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"}, - {"notifyURL removes extraneous single quotes", "", "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := validateNotificationURL(tt.notifyURL, tt.slackHookURL); !reflect.DeepEqual(got, tt.expected) { - t.Errorf("validateNotificationURL() = %v, expected %v", got, tt.expected) - } - }) - } -} - -func Test_stripQuotes(t *testing.T) { - tests := []struct { - name string - input string - expected string - }{ - { - name: "string with no surrounding quotes is unchanged", - input: "Hello, world!", - expected: "Hello, world!", - }, - { - name: "string with surrounding double quotes should strip quotes", - input: "\"Hello, world!\"", - expected: "Hello, world!", - }, - { - name: 
"string with surrounding single quotes should strip quotes", - input: "'Hello, world!'", - expected: "Hello, world!", - }, - { - name: "string with unbalanced surrounding quotes is unchanged", - input: "'Hello, world!\"", - expected: "'Hello, world!\"", - }, - { - name: "string with length of one is unchanged", - input: "'", - expected: "'", - }, - { - name: "string with length of zero is unchanged", - input: "", - expected: "", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := stripQuotes(tt.input); !reflect.DeepEqual(got, tt.expected) { - t.Errorf("stripQuotes() = %v, expected %v", got, tt.expected) - } - }) - } -} diff --git a/internal/notifications/notifications.go b/internal/notifications/notifications.go new file mode 100644 index 000000000..e0707d2d1 --- /dev/null +++ b/internal/notifications/notifications.go @@ -0,0 +1,93 @@ +package notifications + +import ( + "errors" + "fmt" + "github.com/containrrr/shoutrrr" + "github.com/containrrr/shoutrrr/pkg/router" + "github.com/containrrr/shoutrrr/pkg/types" + "log/slog" + "net/url" + "strings" +) + +type Notifier interface { + Send(string, string) error +} + +type NoopNotifier struct{} + +func (nn *NoopNotifier) Send(_ string, _ string) error { + return nil +} + +type ShoutrrrNotifier struct { + serviceRouter *router.ServiceRouter +} + +func (sn *ShoutrrrNotifier) Send(message string, title string) error { + params := &types.Params{} + params.SetTitle(title) + errs := sn.serviceRouter.Send(message, params) + var errList error + if errs != nil { + for _, err := range errs { + errList = errors.Join(errList, err) + } + return errList + } + return nil +} + +func NewNotifier(shoutrrrURL string, legacyURL string) Notifier { + URL := validateNotificationURL(shoutrrrURL, legacyURL) + switch URL { + case "": + return &NoopNotifier{} + default: + servicesURLs := strings.Split(URL, ",") + sr, err := shoutrrr.CreateSender(servicesURLs...) + if err != nil { + slog.Info("Could not create shoutrrr notifier. Will not notify", "url", URL) + return &NoopNotifier{} + } + return &ShoutrrrNotifier{serviceRouter: sr} + } +} + +// Remove old flags +func validateNotificationURL(notifyURL string, slackHookURL string) string { + switch { + case slackHookURL != "" && notifyURL != "": + slog.Error(fmt.Sprintf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL)) + return validateNotificationURL(notifyURL, "") + case notifyURL != "": + return stripQuotes(notifyURL) + case slackHookURL != "": + slog.Info("Deprecated flag(s). Please use --notify-url flag instead.") + parsedURL, err := url.Parse(stripQuotes(slackHookURL)) + if err != nil { + slog.Info(fmt.Sprintf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)) + return "" + } + if len(strings.Split(strings.Replace(parsedURL.Path, "/services/", "", -1), "/")) != 3 { + slog.Error(fmt.Sprintf("slack-hook-url is not properly formatted... 
no notification will be sent: unexpected number of / in URL\n")) + return "" + } + return fmt.Sprintf("slack://%s", strings.Replace(parsedURL.Path, "/services/", "", -1)) + } + return "" +} + +// stripQuotes removes any literal single or double quote chars that surround a string +func stripQuotes(str string) string { + if len(str) > 2 { + firstChar := str[0] + lastChar := str[len(str)-1] + if firstChar == lastChar && (firstChar == '"' || firstChar == '\'') { + return str[1 : len(str)-1] + } + } + // return the original string if it has a length of zero or one + return str +} diff --git a/internal/notifications/notifications_test.go b/internal/notifications/notifications_test.go new file mode 100644 index 000000000..f6e32ee9e --- /dev/null +++ b/internal/notifications/notifications_test.go @@ -0,0 +1,106 @@ +package notifications + +import ( + "reflect" + "testing" +) + +func TestValidateNotificationURL(t *testing.T) { + + tests := []struct { + name string + slackHookURL string + notifyURL string + expected string + }{ + { + "slackHookURL only works fine", + "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", + "", + "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", + }, + { + "slackHookURL and notify URL together only keeps notifyURL", + "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", + "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", + "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", + }, + { + "slackHookURL removes extraneous double quotes", + "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", + "", + "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", + }, + { + "slackHookURL removes extraneous single quotes", + "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'", + "", + "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", + }, + { + "notifyURL removes extraneous double quotes", + "", + "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"", + "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", + }, + { + "notifyURL removes extraneous single quotes", + "", + "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'", + "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := validateNotificationURL(tt.notifyURL, tt.slackHookURL); !reflect.DeepEqual(got, tt.expected) { + t.Errorf("validateNotificationURL() = %v, expected %v", got, tt.expected) + } + }) + } +} + +func Test_stripQuotes(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "string with no surrounding quotes is 
unchanged", + input: "Hello, world!", + expected: "Hello, world!", + }, + { + name: "string with surrounding double quotes should strip quotes", + input: "\"Hello, world!\"", + expected: "Hello, world!", + }, + { + name: "string with surrounding single quotes should strip quotes", + input: "'Hello, world!'", + expected: "Hello, world!", + }, + { + name: "string with unbalanced surrounding quotes is unchanged", + input: "'Hello, world!\"", + expected: "'Hello, world!\"", + }, + { + name: "string with length of one is unchanged", + input: "'", + expected: "'", + }, + { + name: "string with length of zero is unchanged", + input: "", + expected: "", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := stripQuotes(tt.input); !reflect.DeepEqual(got, tt.expected) { + t.Errorf("stripQuotes() = %v, expected %v", got, tt.expected) + } + }) + } +} From f3244d6ab15a482a04266f9de61358a69e65e410 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 11 Nov 2024 10:40:01 +0100 Subject: [PATCH 17/22] Remove deprecated notifications URLs Without this, we will rely on old urls forever. It has been multiple cycles we removed it, we are now in a good place to remove the vars. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 17 +-- internal/notifications/notifications.go | 59 ++++----- internal/notifications/notifications_test.go | 129 +++++++++++-------- 3 files changed, 102 insertions(+), 103 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 3c3285844..09341dc50 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -59,10 +59,7 @@ var ( alertFiringOnly bool rebootSentinelFile string rebootSentinelCommand string - notifyURL string - slackHookURL string - slackUsername string - slackChannel string + notifyURLs []string messageTemplateDrain string messageTemplateReboot string messageTemplateUncordon string @@ -167,14 +164,7 @@ func main() { "amount of nodes to concurrently reboot. Defaults to 1") flag.IntVar(&rebootSignal, "reboot-signal", sigRTMinPlus5, "signal to use for reboot, SIGRTMIN+5 by default.") - flag.StringVar(&slackHookURL, "slack-hook-url", "", - "slack hook URL for reboot notifications [deprecated in favor of --notify-url]") - flag.StringVar(&slackUsername, "slack-username", "kured", - "slack username for reboot notifications") - flag.StringVar(&slackChannel, "slack-channel", "", - "slack channel for reboot notifications") - flag.StringVar(¬ifyURL, "notify-url", "", - "notify URL for reboot notifications (cannot use with --slack-hook-url flags)") + flag.StringArrayVar(¬ifyURLs, "notify-url", nil, "notify URL for reboot notifications (can be repeated for multiple notifications)") flag.StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!", "message template used to notify about a node being successfully uncordoned") flag.StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s", @@ -230,7 +220,7 @@ func main() { os.Exit(2) } - notifier := notifications.NewNotifier(notifyURL, slackHookURL) + notifier := notifications.NewNotifier(notifyURLs...) 
err = validateNodeLabels(preRebootNodeLabels, postRebootNodeLabels) if err != nil { @@ -379,6 +369,7 @@ func LoadFromEnv() { os.Exit(1) } default: + //String Arrays are not supported from CLI for example fmt.Printf("Unsupported flag type for %s\n", f.Name) } } diff --git a/internal/notifications/notifications.go b/internal/notifications/notifications.go index e0707d2d1..bbdd6f241 100644 --- a/internal/notifications/notifications.go +++ b/internal/notifications/notifications.go @@ -2,12 +2,10 @@ package notifications import ( "errors" - "fmt" "github.com/containrrr/shoutrrr" "github.com/containrrr/shoutrrr/pkg/router" "github.com/containrrr/shoutrrr/pkg/types" "log/slog" - "net/url" "strings" ) @@ -39,49 +37,38 @@ func (sn *ShoutrrrNotifier) Send(message string, title string) error { return nil } -func NewNotifier(shoutrrrURL string, legacyURL string) Notifier { - URL := validateNotificationURL(shoutrrrURL, legacyURL) - switch URL { - case "": +func NewNotifier(URLs ...string) Notifier { + if URLs == nil { return &NoopNotifier{} - default: - servicesURLs := strings.Split(URL, ",") - sr, err := shoutrrr.CreateSender(servicesURLs...) - if err != nil { - slog.Info("Could not create shoutrrr notifier. Will not notify", "url", URL) - return &NoopNotifier{} + } + var servicesURLs []string + for _, givenURL := range URLs { + serviceURL := stripQuotes(givenURL) + if serviceURL != "" { + servicesURLs = append(servicesURLs, serviceURL) } - return &ShoutrrrNotifier{serviceRouter: sr} + + } + if len(servicesURLs) == 0 { + return &NoopNotifier{} } -} -// Remove old flags -func validateNotificationURL(notifyURL string, slackHookURL string) string { - switch { - case slackHookURL != "" && notifyURL != "": - slog.Error(fmt.Sprintf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL)) - return validateNotificationURL(notifyURL, "") - case notifyURL != "": - return stripQuotes(notifyURL) - case slackHookURL != "": - slog.Info("Deprecated flag(s). Please use --notify-url flag instead.") - parsedURL, err := url.Parse(stripQuotes(slackHookURL)) - if err != nil { - slog.Info(fmt.Sprintf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)) - return "" - } - if len(strings.Split(strings.Replace(parsedURL.Path, "/services/", "", -1), "/")) != 3 { - slog.Error(fmt.Sprintf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n")) - return "" - } - return fmt.Sprintf("slack://%s", strings.Replace(parsedURL.Path, "/services/", "", -1)) + sr, err := shoutrrr.CreateSender(servicesURLs...) + if err != nil { + slog.Info( + "Could not create shoutrrr notifier. 
Will not notify", + "urls", strings.Join(servicesURLs, " "), + "error", err, + ) + return &NoopNotifier{} } - return "" + + return &ShoutrrrNotifier{serviceRouter: sr} } // stripQuotes removes any literal single or double quote chars that surround a string func stripQuotes(str string) string { - if len(str) > 2 { + if len(str) >= 2 { firstChar := str[0] lastChar := str[len(str)-1] if firstChar == lastChar && (firstChar == '"' || firstChar == '\'') { diff --git a/internal/notifications/notifications_test.go b/internal/notifications/notifications_test.go index f6e32ee9e..e85fe4703 100644 --- a/internal/notifications/notifications_test.go +++ b/internal/notifications/notifications_test.go @@ -5,60 +5,6 @@ import ( "testing" ) -func TestValidateNotificationURL(t *testing.T) { - - tests := []struct { - name string - slackHookURL string - notifyURL string - expected string - }{ - { - "slackHookURL only works fine", - "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", - "", - "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", - }, - { - "slackHookURL and notify URL together only keeps notifyURL", - "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", - "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", - "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", - }, - { - "slackHookURL removes extraneous double quotes", - "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", - "", - "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", - }, - { - "slackHookURL removes extraneous single quotes", - "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'", - "", - "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", - }, - { - "notifyURL removes extraneous double quotes", - "", - "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"", - "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", - }, - { - "notifyURL removes extraneous single quotes", - "", - "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'", - "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := validateNotificationURL(tt.notifyURL, tt.slackHookURL); !reflect.DeepEqual(got, tt.expected) { - t.Errorf("validateNotificationURL() = %v, expected %v", got, tt.expected) - } - }) - } -} - func Test_stripQuotes(t *testing.T) { tests := []struct { name string @@ -85,6 +31,16 @@ func Test_stripQuotes(t *testing.T) { input: "'Hello, world!\"", expected: "'Hello, world!\"", }, + { + name: "string with length of two is stripped", + input: "\"\"", + expected: "", + }, + { + name: "string with length of two 
is stripped", + input: "''", + expected: "", + }, { name: "string with length of one is unchanged", input: "'", @@ -104,3 +60,68 @@ func Test_stripQuotes(t *testing.T) { }) } } + +func TestNewNotifier(t *testing.T) { + type args struct { + URLs []string + } + tests := []struct { + name string + args args + want Notifier + }{ + { + name: "No URLs means no notifier", + args: args{}, + want: &NoopNotifier{}, + }, + { + name: "Empty slice means no notifier", + args: args{URLs: []string{}}, + want: &NoopNotifier{}, + }, + { + name: "Empty string means no notifier", + args: args{URLs: []string{""}}, + want: &NoopNotifier{}, + }, + { + name: "Pseudo-Empty string means no notifier", + args: args{URLs: []string{"''"}}, + want: &NoopNotifier{}, + }, + { + name: "Pseudo-Empty string means no notifier", + args: args{URLs: []string{"\"\""}}, + want: &NoopNotifier{}, + }, + { + name: "Invalid string means no notifier", + args: args{URLs: []string{"'"}}, + want: &NoopNotifier{}, + }, + { + name: "Old shoutrrr slack urls are not valid anymore", + args: args{URLs: []string{"slack://xxxx/yyyy/zzzz"}}, + want: &NoopNotifier{}, + }, + { + name: "Valid slack bot API notifier url", + args: args{URLs: []string{"slack://xoxb:123456789012-1234567890123-4mt0t4l1YL3g1T5L4cK70k3N@C001CH4NN3L?color=good&title=Great+News&icon=man-scientist&botname=Shoutrrrbot"}}, + want: &ShoutrrrNotifier{}, + }, + { + name: "Valid slack webhook notifier url", + args: args{URLs: []string{"slack://hook:WNA3PBYV6-F20DUQND3RQ-Webc4MAvoacrpPakR8phF0zi@webhook?color=good&title=Great+News&icon=man-scientist&botname=Shoutrrrbot"}}, + want: &ShoutrrrNotifier{}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := NewNotifier(tt.args.URLs...) + if reflect.TypeOf(got) != reflect.TypeOf(tt.want) { + t.Errorf("NewNotifier() = %v, want %v", reflect.TypeOf(got), reflect.TypeOf(tt.want)) + } + }) + } +} From 4fb05dfaebab5ae3daf62d3801350460f204a891 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 11 Nov 2024 10:55:37 +0100 Subject: [PATCH 18/22] Replace annotation names Now that kured is a community project under the CNCF, it might be appropriate to use the official kured name, instead of the old weave.works name. As this is intrusive, there was no occasion to do it before. Because other commits in this branch are intrusive (change of main loop, removing old flags, ...) this is the occasion to clean up the cruft. 
Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 09341dc50..ed462d4d3 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -93,11 +93,11 @@ var ( const ( // KuredNodeLockAnnotation is the canonical string value for the kured node-lock annotation - KuredNodeLockAnnotation string = "weave.works/kured-node-lock" + KuredNodeLockAnnotation string = "kured.dev/kured-node-lock" // KuredRebootInProgressAnnotation is the canonical string value for the kured reboot-in-progress annotation - KuredRebootInProgressAnnotation string = "weave.works/kured-reboot-in-progress" + KuredRebootInProgressAnnotation string = "kured.dev/kured-reboot-in-progress" // KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation - KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" + KuredMostRecentRebootNeededAnnotation string = "kured.dev/kured-most-recent-reboot-needed" // EnvPrefix The environment variable prefix of all environment variables bound to our command line flags. EnvPrefix = "KURED" // TODO: Replace this with runtime evaluation @@ -155,7 +155,7 @@ func main() { flag.StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required", "path to file whose existence triggers the reboot command") flag.StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "", - "Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"weave.works/kured-node-reboot\" to enable tainting.") + "Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"kured.dev/kured-node-reboot\" to enable tainting.") flag.StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "", "command for which a zero return code will trigger a reboot command") flag.StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot", @@ -182,7 +182,7 @@ func main() { flag.StringVar(&timezone, "time-zone", "UTC", "use this timezone for schedule inputs") flag.BoolVar(&annotateNodes, "annotate-nodes", false, - "if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") + "if set, the annotations 'kured.dev/kured-reboot-in-progress' and 'kured.dev/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") flag.StringVar(&logFormat, "log-format", "text", "use text or json log format") flag.StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil, From a42e54d73c63ba71348e11c43a80a5bb455cbd00 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 11 Nov 2024 11:05:09 +0100 Subject: [PATCH 19/22] Fix flags order Without this, the flags are becoming a mess. This cleans up the order. Not very important. 
Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 177 ++++++++++++++++++---------------------------- 1 file changed, 69 insertions(+), 108 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index ed462d4d3..ef60f78f7 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -35,48 +35,47 @@ import ( var ( version = "unreleased" - // Command line flags - forceReboot bool + // Command line flags (sorted alphabetically) + alertFilter regexpValue + alertFilterMatchOnly bool + alertFiringOnly bool + annotateNodes bool + concurrency int drainDelay time.Duration - drainTimeout time.Duration - rebootDelay time.Duration - rebootMethod string - period time.Duration - metricsHost string - metricsPort int drainGracePeriod int drainPodSelector string - skipWaitForDeleteTimeoutSeconds int - dsNamespace string + drainTimeout time.Duration dsName string + dsNamespace string lockAnnotation string - lockTTL time.Duration lockReleaseDelay time.Duration - prometheusURL string - preferNoScheduleTaintName string - alertFilter regexpValue - alertFilterMatchOnly bool - alertFiringOnly bool - rebootSentinelFile string - rebootSentinelCommand string - notifyURLs []string + lockTTL time.Duration + logFormat string messageTemplateDrain string messageTemplateReboot string messageTemplateUncordon string + metricsHost string + metricsPort int + nodeID string + notifyURLs []string + period time.Duration podSelectors []string + postRebootNodeLabels []string + preRebootNodeLabels []string + preferNoScheduleTaintName string + prometheusURL string rebootCommand string + rebootDays []string + rebootDelay time.Duration + rebootEnd string + rebootMethod string + rebootSentinelCommand string + rebootSentinelFile string rebootSignal int - logFormat string - preRebootNodeLabels []string - postRebootNodeLabels []string - nodeID string - concurrency int - - rebootDays []string - rebootStart string - rebootEnd string - timezone string - annotateNodes bool + rebootStart string + skipWaitForDeleteTimeoutSeconds int + timezone string + forceReboot bool // Metrics rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ @@ -110,85 +109,47 @@ func init() { func main() { - flag.StringVar(&nodeID, "node-id", "", - "node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable") - flag.BoolVar(&forceReboot, "force-reboot", false, - "force a reboot even if the drain fails or times out") - flag.StringVar(&metricsHost, "metrics-host", "", - "host where metrics will listen") - flag.IntVar(&metricsPort, "metrics-port", 8080, - "port number where metrics will listen") - flag.IntVar(&drainGracePeriod, "drain-grace-period", -1, - "time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used") - flag.StringVar(&drainPodSelector, "drain-pod-selector", "", - "only drain pods with labels matching the selector (default: '', all pods)") - flag.IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0, - "when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node") - flag.DurationVar(&drainDelay, "drain-delay", 0, - "delay drain for this duration (default: 0, disabled)") - flag.DurationVar(&drainTimeout, "drain-timeout", 0, - "timeout after which the drain is aborted (default: 0, infinite time)") - flag.DurationVar(&rebootDelay, "reboot-delay", 0, - "delay reboot for this duration (default: 0, disabled)") - 
flag.StringVar(&rebootMethod, "reboot-method", "command", - "method to use for reboots. Available: command") - flag.DurationVar(&period, "period", time.Minute, - "period at which the main operations are done") - flag.StringVar(&dsNamespace, "ds-namespace", "kube-system", - "namespace containing daemonset on which to place lock") - flag.StringVar(&dsName, "ds-name", "kured", - "name of daemonset on which to place lock") - flag.StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation, - "annotation in which to record locking node") - flag.DurationVar(&lockTTL, "lock-ttl", 0, - "expire lock annotation after this duration (default: 0, disabled)") - flag.DurationVar(&lockReleaseDelay, "lock-release-delay", 0, - "delay lock release for this duration (default: 0, disabled)") - flag.StringVar(&prometheusURL, "prometheus-url", "", - "Prometheus instance to probe for active alerts") - flag.Var(&alertFilter, "alert-filter-regexp", - "alert names to ignore when checking for active alerts") - flag.BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, - "Only block if the alert-filter-regexp matches active alerts") - flag.BoolVar(&alertFiringOnly, "alert-firing-only", false, - "only consider firing alerts when checking for active alerts") - flag.StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required", - "path to file whose existence triggers the reboot command") - flag.StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "", - "Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"kured.dev/kured-node-reboot\" to enable tainting.") - flag.StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "", - "command for which a zero return code will trigger a reboot command") - flag.StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot", - "command to run when a reboot is required") - flag.IntVar(&concurrency, "concurrency", 1, - "amount of nodes to concurrently reboot. 
Defaults to 1") - flag.IntVar(&rebootSignal, "reboot-signal", sigRTMinPlus5, - "signal to use for reboot, SIGRTMIN+5 by default.") + // flags are sorted alphabetically by type + flag.BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, "Only block if the alert-filter-regexp matches active alerts") + flag.BoolVar(&alertFiringOnly, "alert-firing-only", false, "only consider firing alerts when checking for active alerts") + flag.BoolVar(&annotateNodes, "annotate-nodes", false, "if set, the annotations 'kured.dev/kured-reboot-in-progress' and 'kured.dev/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") + flag.BoolVar(&forceReboot, "force-reboot", false, "force a reboot even if the drain fails or times out") + flag.DurationVar(&drainDelay, "drain-delay", 0, "delay drain for this duration (default: 0, disabled)") + flag.DurationVar(&drainTimeout, "drain-timeout", 0, "timeout after which the drain is aborted (default: 0, infinite time)") + flag.DurationVar(&lockReleaseDelay, "lock-release-delay", 0, "delay lock release for this duration (default: 0, disabled)") + flag.DurationVar(&lockTTL, "lock-ttl", 0, "expire lock annotation after this duration (default: 0, disabled)") + flag.DurationVar(&period, "period", time.Minute, "period at which the main operations are done") + flag.DurationVar(&rebootDelay, "reboot-delay", 0, "delay reboot for this duration (default: 0, disabled)") + flag.IntVar(&concurrency, "concurrency", 1, "amount of nodes to concurrently reboot. Defaults to 1") + flag.IntVar(&drainGracePeriod, "drain-grace-period", -1, "time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used") + flag.IntVar(&metricsPort, "metrics-port", 8080, "port number where metrics will listen") + flag.IntVar(&rebootSignal, "reboot-signal", sigRTMinPlus5, "signal to use for reboot, SIGRTMIN+5 by default.") + flag.IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0, "when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node") flag.StringArrayVar(¬ifyURLs, "notify-url", nil, "notify URL for reboot notifications (can be repeated for multiple notifications)") - flag.StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!", - "message template used to notify about a node being successfully uncordoned") - flag.StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s", - "message template used to notify about a node being drained") - flag.StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s", - "message template used to notify about a node being rebooted") - flag.StringArrayVar(&podSelectors, "blocking-pod-selector", nil, - "label selector identifying pods whose presence should prevent reboots") - flag.StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay, - "schedule reboot on these days") - flag.StringVar(&rebootStart, "start-time", "0:00", - "schedule reboot only after this time of day") - flag.StringVar(&rebootEnd, "end-time", "23:59:59", - "schedule reboot only before this time of day") - flag.StringVar(&timezone, "time-zone", "UTC", - "use this timezone for schedule inputs") - flag.BoolVar(&annotateNodes, "annotate-nodes", false, - "if set, the annotations 'kured.dev/kured-reboot-in-progress' and 'kured.dev/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") - 
flag.StringVar(&logFormat, "log-format", "text", - "use text or json log format") - flag.StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil, - "labels to add to nodes before cordoning") - flag.StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil, - "labels to add to nodes after uncordoning") + flag.StringArrayVar(&podSelectors, "blocking-pod-selector", nil, "label selector identifying pods whose presence should prevent reboots") + flag.StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil, "labels to add to nodes after uncordoning") + flag.StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil, "labels to add to nodes before cordoning") + flag.StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay, "schedule reboot on these days") + flag.StringVar(&drainPodSelector, "drain-pod-selector", "", "only drain pods with labels matching the selector (default: '', all pods)") + flag.StringVar(&dsName, "ds-name", "kured", "name of daemonset on which to place lock") + flag.StringVar(&dsNamespace, "ds-namespace", "kube-system", "namespace containing daemonset on which to place lock") + flag.StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation, "annotation in which to record locking node") + flag.StringVar(&logFormat, "log-format", "text", "use text or json log format") + flag.StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s", "message template used to notify about a node being drained") + flag.StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s", "message template used to notify about a node being rebooted") + flag.StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!", "message template used to notify about a node being successfully uncordoned") + flag.StringVar(&metricsHost, "metrics-host", "", "host where metrics will listen") + flag.StringVar(&nodeID, "node-id", "", "node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable") + flag.StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "", "Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"kured.dev/kured-node-reboot\" to enable tainting.") + flag.StringVar(&prometheusURL, "prometheus-url", "", "Prometheus instance to probe for active alerts") + flag.StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot", "command to run when a reboot is required") + flag.StringVar(&rebootEnd, "end-time", "23:59:59", "schedule reboot only before this time of day") + flag.StringVar(&rebootMethod, "reboot-method", "command", "method to use for reboots. 
Available: command") + flag.StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "", "command for which a zero return code will trigger a reboot command") + flag.StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required", "path to file whose existence triggers the reboot command") + flag.StringVar(&rebootStart, "start-time", "0:00", "schedule reboot only after this time of day") + flag.StringVar(&timezone, "time-zone", "UTC", "use this timezone for schedule inputs") + flag.Var(&alertFilter, "alert-filter-regexp", "alert names to ignore when checking for active alerts") flag.Parse() From 1aee6b28d3667ce35faf187a06a7676df21aa6f3 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 11 Nov 2024 13:08:57 +0100 Subject: [PATCH 20/22] Use node annotation for previous state Without this, we use the daemonset lock for annotating the state of the node. This is unnecessary, as we have rights on the node. This is a problem, as it makes the whole lock model more complex, and prevents other implementations for lock. This fixes it by adapting the drain/uncordon functions to rely on the node annotations, by introducing a new annotation, named "kured.dev/node-unschedulable-before-drain" Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 155 +++++++++------- internal/daemonsetlock/daemonsetlock.go | 186 +++++++++---------- internal/daemonsetlock/daemonsetlock_test.go | 4 +- 3 files changed, 183 insertions(+), 162 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index ef60f78f7..4846c6f76 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -39,7 +39,7 @@ var ( alertFilter regexpValue alertFilterMatchOnly bool alertFiringOnly bool - annotateNodes bool + annotateNodeProgress bool concurrency int drainDelay time.Duration drainGracePeriod int @@ -92,7 +92,8 @@ var ( const ( // KuredNodeLockAnnotation is the canonical string value for the kured node-lock annotation - KuredNodeLockAnnotation string = "kured.dev/kured-node-lock" + KuredNodeLockAnnotation string = "kured.dev/kured-node-lock" + KuredNodeWasUnschedulableBeforeDrainAnnotation string = "kured.dev/node-unschedulable-before-drain" // KuredRebootInProgressAnnotation is the canonical string value for the kured reboot-in-progress annotation KuredRebootInProgressAnnotation string = "kured.dev/kured-reboot-in-progress" // KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation @@ -112,7 +113,7 @@ func main() { // flags are sorted alphabetically by type flag.BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, "Only block if the alert-filter-regexp matches active alerts") flag.BoolVar(&alertFiringOnly, "alert-firing-only", false, "only consider firing alerts when checking for active alerts") - flag.BoolVar(&annotateNodes, "annotate-nodes", false, "if set, the annotations 'kured.dev/kured-reboot-in-progress' and 'kured.dev/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") + flag.BoolVar(&annotateNodeProgress, "annotate-nodes", false, "if set, the annotations 'kured.dev/kured-reboot-in-progress' and 'kured.dev/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") flag.BoolVar(&forceReboot, "force-reboot", false, "force a reboot even if the drain fails or times out") flag.DurationVar(&drainDelay, "drain-delay", 0, "delay drain for this duration (default: 0, disabled)") flag.DurationVar(&drainTimeout, "drain-timeout", 0, "timeout after which the drain is aborted 
(default: 0, infinite time)") @@ -199,8 +200,8 @@ func main() { slog.Info(fmt.Sprintf("preferNoSchedule taint: (%s)", preferNoScheduleTaintName), "node", nodeID) - if annotateNodes { - slog.Info("Will annotate nodes during kured reboot operations", "node", nodeID) + if annotateNodeProgress { + slog.Info("Will annotate nodes progress during kured reboot operations", "node", nodeID) } rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal, rebootDelay, true, 1) @@ -390,6 +391,20 @@ func drain(client *kubernetes.Clientset, node *v1.Node, notifier notifications.N Timeout: drainTimeout, } + // Add previous state of the node Spec.Unschedulable into an annotation + // If an annotation was present, it means that either the cordon or drain failed, + // hence it does not need to reapply: It might override what the user has set + // (for example if the cordon succeeded but the drain failed) + if _, ok := node.Annotations[KuredNodeWasUnschedulableBeforeDrainAnnotation]; !ok { + // Store State of the node before cordon changes it + annotations := map[string]string{KuredNodeWasUnschedulableBeforeDrainAnnotation: strconv.FormatBool(node.Spec.Unschedulable)} + // & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot + err := addNodeAnnotations(client, nodeID, annotations) + if err != nil { + return fmt.Errorf("error saving state of the node %s, %v", nodename, err) + } + } + if err := kubectldrain.RunCordonOrUncordon(drainer, node, true); err != nil { return fmt.Errorf("error cordonning node %s, %v", nodename, err) } @@ -400,8 +415,30 @@ func drain(client *kubernetes.Clientset, node *v1.Node, notifier notifications.N return nil } -func uncordon(client *kubernetes.Clientset, node *v1.Node) error { - nodename := node.GetName() +func uncordon(client *kubernetes.Clientset, node *v1.Node, notifier notifications.Notifier) error { + // Revert cordon spec change with the help of node annotation + annotationContent, ok := node.Annotations[KuredNodeWasUnschedulableBeforeDrainAnnotation] + if !ok { + // If no node annotations, uncordon will not act. + // Do not uncordon if you do not know previous state, it could bring nodes under maintenance online! 
+ return nil + } + + wasUnschedulable, err := strconv.ParseBool(annotationContent) + if err != nil { + return fmt.Errorf("annotation was edited and cannot be converted back to bool %v, cannot uncordon (unrecoverable)", err) + } + + if wasUnschedulable { + // Just delete the annotation, keep Cordonned + err := deleteNodeAnnotation(client, nodeID, KuredNodeWasUnschedulableBeforeDrainAnnotation) + if err != nil { + return fmt.Errorf("error removing the WasUnschedulable annotation, keeping the node stuck in cordonned state forever %v", err) + } + return nil + } + + nodeName := node.GetName() kubectlStdOutLogger := &slogWriter{message: "uncordon: results", stream: "stdout"} kubectlStdErrLogger := &slogWriter{message: "uncordon: results", stream: "stderr"} @@ -412,11 +449,17 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error { Ctx: context.Background(), } if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil { - return fmt.Errorf("error uncordonning node %s: %v", nodename, err) + return fmt.Errorf("error uncordonning node %s: %v", nodeName, err) } else if postRebootNodeLabels != nil { err := updateNodeLabels(client, node, postRebootNodeLabels) - return fmt.Errorf("error updating node (%s) labels, needs manual intervention %v", nodename, err) + return fmt.Errorf("error updating node (%s) labels, needs manual intervention %v", nodeName, err) } + + err = deleteNodeAnnotation(client, nodeID, KuredNodeWasUnschedulableBeforeDrainAnnotation) + if err != nil { + return fmt.Errorf("error removing the WasUnschedulable annotation, keeping the node stuck in current state forever %v", err) + } + notifier.Send(fmt.Sprintf(messageTemplateUncordon, nodeID), "Node uncordonned successfully") return nil } @@ -516,44 +559,13 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. continue } - // Get lock data to know if need to uncordon the node - // to get the node back to its previous state - // TODO: Need to move to another method to check the current data of the lock relevant for this node - holding, lockData, err := lock.Holding() + err = uncordon(client, node, notifier) if err != nil { - // Internal wiring, admin does not need to be aware unless debugging session - slog.Debug(fmt.Sprintf("Error checking lock - Not applying any action: %v.\nPlease check API", err), "node", nodeID, "error", err) + // Might be a transient API issue or a real problem. Inform the admin + slog.Info("unable to uncordon needs investigation", "node", nodeID, "error", err) continue } - // we check if holding ONLY to know if lockData is valid. - // When moving to fetch lockData as a separate method, remove - // this whole condition. - // However, it means that Release() - // need to behave idempotently regardless or not the lock is - // held, but that's an ideal state. - // what we should simply do is reconcile the lock data - // with the node spec. But behind the scenes its a bug - // if it's not holding due to an error - if holding && !lockData.Metadata.Unschedulable { - // Split into two lines to remember I need to remove the first - // condition ;) - if node.Spec.Unschedulable != lockData.Metadata.Unschedulable && lockData.Metadata.Unschedulable == false { - // Important lifecycle event - admin should be aware - slog.Info("uncordoning node", "node", nodeID) - err = uncordon(client, node) - if err != nil { - // Transient API issue - no problem, will retry. 
No need to worry the admin for this - slog.Debug(fmt.Sprintf("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err), "node", nodeID, "error", err) - continue - } - // TODO, modify the actions to directly log and notify, instead of individual methods giving - // an incomplete view of the lifecycle - notifier.Send(fmt.Sprintf(messageTemplateUncordon, nodeID), "Node uncordonned") - } - - } - // Releasing lock earlier is nice for other nodes err = lock.Release() if err != nil { @@ -563,7 +575,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } // Regardless or not we are holding the lock // The node should not say it's still in progress if the reboot is done - if annotateNodes { + if annotateNodeProgress { if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok { // Who reads this? I hope nobody bothers outside real debug cases slog.Debug(fmt.Sprintf("Deleting node %s annotation %s", nodeID, KuredRebootInProgressAnnotation), "node", nodeID) @@ -578,6 +590,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. rebootRequiredGauge.WithLabelValues(nodeID).Set(1) // Act on reboot required. + if !window.Contains(time.Now()) { // Probably spamming outside the maintenance window. This should not be logged as info slog.Debug("reboot required but outside maintenance window", "node", nodeID) @@ -602,11 +615,11 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. slog.Debug("error retrieving node object via k8s API, retrying at next tick", "node", nodeID, "error", err) continue } - // nodeMeta contains the node Metadata that should be included in the lock - nodeMeta := daemonsetlock.NodeMeta{Unschedulable: node.Spec.Unschedulable} + //// nodeMeta contains the node Metadata that should be included in the lock + //nodeMeta := daemonsetlock.NodeMeta{Unschedulable: node.Spec.Unschedulable} var timeNowString string - if annotateNodes { + if annotateNodeProgress { if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; !ok { timeNowString = time.Now().Format(time.RFC3339) // Annotate this node to indicate that "I am going to be rebooted!" @@ -624,27 +637,32 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // Prefer to not schedule pods onto this node to avoid draing the same pod multiple times. 
preferNoScheduleTaint.Enable() - // This could be merged into a single idempotent "Acquire" lock - holding, _, err := lock.Holding() - if err != nil { - // Not important to worry the admin - slog.Debug("error testing lock", "node", nodeID, "error", err) + holding, err := lock.Acquire() + if err != nil || !holding { + slog.Debug("error acquiring lock, will retry at next tick", "node", nodeID, "error", err) continue } - - if !holding { - acquired, holder, err := lock.Acquire(nodeMeta) - if err != nil { - // Not important to worry the admin - slog.Debug("error acquiring lock, will retry at next tick", "node", nodeID, "error", err) - continue - } - if !acquired { - // Not very important - lock prevents action - slog.Debug(fmt.Sprintf("Lock already held by %v, will retry at next tick", holder), "node", nodeID) - continue - } - } + //// This could be merged into a single idempotent "Acquire" lock + //holding, _, err := lock.Holding() + //if err != nil { + // // Not important to worry the admin + // slog.Debug("error testing lock", "node", nodeID, "error", err) + // continue + //} + // + //if !holding { + // acquired, holder, err := lock.Acquire(nodeMeta) + // if err != nil { + // // Not important to worry the admin + // slog.Debug("error acquiring lock, will retry at next tick", "node", nodeID, "error", err) + // continue + // } + // if !acquired { + // // Not very important - lock prevents action + // slog.Debug(fmt.Sprintf("Lock already held by %v, will retry at next tick", holder), "node", nodeID) + // continue + // } + //} err = drain(client, node, notifier) if err != nil { @@ -659,7 +677,10 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // If shown, it is helping understand the uncordonning. If the admin seems the node as cordonned // with this, it needs to take action (for example if the node was previously cordonned!) slog.Info("Performing a best-effort uncordon after failed cordon and drain", "node", nodeID) - uncordon(client, node) + err := uncordon(client, node, notifier) + if err != nil { + slog.Info("Uncordon failed", "error", err) + } continue } } diff --git a/internal/daemonsetlock/daemonsetlock.go b/internal/daemonsetlock/daemonsetlock.go index 54a698613..3df0aab13 100644 --- a/internal/daemonsetlock/daemonsetlock.go +++ b/internal/daemonsetlock/daemonsetlock.go @@ -5,7 +5,6 @@ import ( "encoding/json" "fmt" "log/slog" - "strings" "time" v1 "k8s.io/api/apps/v1" @@ -21,9 +20,8 @@ const ( ) type Lock interface { - Acquire(NodeMeta) (bool, string, error) + Acquire() (bool, error) Release() error - Holding() (bool, LockAnnotationValue, error) } type GenericLock struct { @@ -62,9 +60,7 @@ type DaemonSetMultiLock struct { } // LockAnnotationValue contains the lock data, -// which allows persistence across reboots, particularily recording if the -// node was already unschedulable before kured reboot. -// To be modified when using another type of lock storage. +// Metadata is there for migrations reasons. 
Ignore this as much as possible type LockAnnotationValue struct { NodeID string `json:"nodeID"` Metadata NodeMeta `json:"metadata,omitempty"` @@ -128,32 +124,32 @@ func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.Daemon } // Acquire attempts to annotate the kured daemonset with lock info from instantiated DaemonSetLock using client-go -func (dsl *DaemonSetSingleLock) Acquire(nodeMetadata NodeMeta) (bool, string, error) { +func (dsl *DaemonSetSingleLock) Acquire() (bool, error) { for { ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) if err != nil { - return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) + return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) } valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] if exists { value := LockAnnotationValue{} if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return false, "", err + return false, err } if !ttlExpired(value.Created, value.TTL) { - return value.NodeID == dsl.nodeID, value.NodeID, nil + return value.NodeID == dsl.nodeID, nil } } if ds.ObjectMeta.Annotations == nil { ds.ObjectMeta.Annotations = make(map[string]string) } - value := LockAnnotationValue{NodeID: dsl.nodeID, Metadata: nodeMetadata, Created: time.Now().UTC(), TTL: dsl.TTL} + value := LockAnnotationValue{NodeID: dsl.nodeID, Created: time.Now().UTC(), TTL: dsl.TTL} valueBytes, err := json.Marshal(&value) if err != nil { - return false, "", err + return false, err } ds.ObjectMeta.Annotations[dsl.annotation] = string(valueBytes) @@ -164,35 +160,36 @@ func (dsl *DaemonSetSingleLock) Acquire(nodeMetadata NodeMeta) (bool, string, er time.Sleep(time.Second) continue } else { - return false, "", err + return false, err } } - return true, dsl.nodeID, nil + return true, nil } } -// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go -func (dsl *DaemonSetSingleLock) Holding() (bool, LockAnnotationValue, error) { - var lockData LockAnnotationValue - ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) - if err != nil { - return false, lockData, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) - } - - valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] - if exists { - value := LockAnnotationValue{} - if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return false, lockData, err - } - - if !ttlExpired(value.Created, value.TTL) { - return value.NodeID == dsl.nodeID, value, nil - } - } - - return false, lockData, nil -} +// +//// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go +//func (dsl *DaemonSetSingleLock) Holding() (bool, LockAnnotationValue, error) { +// var lockData LockAnnotationValue +// ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) +// if err != nil { +// return false, lockData, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) +// } +// +// valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] +// if exists { +// value := LockAnnotationValue{} +// if err := json.Unmarshal([]byte(valueString), &value); err != nil { +// return false, lockData, err +// } +// +// if !ttlExpired(value.Created, value.TTL) { +// return value.NodeID == 
dsl.nodeID, value, nil +// } +// } +// +// return false, lockData, nil +//} // Release attempts to remove the lock data from the kured ds annotations using client-go func (dsl *DaemonSetSingleLock) Release() error { @@ -207,17 +204,18 @@ func (dsl *DaemonSetSingleLock) Release() error { } valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] - if exists { - value := LockAnnotationValue{} - if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return err - } - if value.NodeID != dsl.nodeID { - return fmt.Errorf("not lock holder: %v", value.NodeID) - } - } else { - return fmt.Errorf("lock not held") + if !exists { + return nil + } + + value := LockAnnotationValue{} + if err := json.Unmarshal([]byte(valueString), &value); err != nil { + return err + } + + if value.NodeID != dsl.nodeID { + return fmt.Errorf("not lock holder: %v", value.NodeID) } delete(ds.ObjectMeta.Annotations, dsl.annotation) @@ -243,15 +241,15 @@ func ttlExpired(created time.Time, ttl time.Duration) bool { return false } -func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string { - nodeIDs := make([]string, 0, len(annotation.LockAnnotations)) - for _, nodeLock := range annotation.LockAnnotations { - nodeIDs = append(nodeIDs, nodeLock.NodeID) - } - return nodeIDs -} +//func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string { +// nodeIDs := make([]string, 0, len(annotation.LockAnnotations)) +// for _, nodeLock := range annotation.LockAnnotations { +// nodeIDs = append(nodeIDs, nodeLock.NodeID) +// } +// return nodeIDs +//} -func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata NodeMeta, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) { +func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) { newAnnotation := multiLockAnnotationValue{MaxOwners: maxOwners} freeSpace := false if annotation.LockAnnotations == nil || len(annotation.LockAnnotations) < maxOwners { @@ -274,10 +272,9 @@ func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue newAnnotation.LockAnnotations = append( newAnnotation.LockAnnotations, LockAnnotationValue{ - NodeID: dsl.nodeID, - Metadata: metadata, - Created: time.Now().UTC(), - TTL: TTL, + NodeID: dsl.nodeID, + Created: time.Now().UTC(), + TTL: TTL, }, ) return true, newAnnotation @@ -287,24 +284,24 @@ func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue } // Acquire creates and annotates the daemonset with a multiple owner lock -func (dsl *DaemonSetMultiLock) Acquire(nodeMetaData NodeMeta) (bool, string, error) { +func (dsl *DaemonSetMultiLock) Acquire() (bool, error) { for { ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) if err != nil { - return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) + return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) } annotation := multiLockAnnotationValue{} valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] if exists { if err := json.Unmarshal([]byte(valueString), &annotation); err != nil { - return false, "", fmt.Errorf("error getting multi lock: %w", err) + return false, fmt.Errorf("error getting multi lock: %w", err) } } - lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, nodeMetaData, dsl.TTL, dsl.maxOwners) + lockPossible, 
newAnnotation := dsl.canAcquireMultiple(annotation, dsl.TTL, dsl.maxOwners) if !lockPossible { - return false, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil + return false, nil } if ds.ObjectMeta.Annotations == nil { @@ -312,7 +309,7 @@ func (dsl *DaemonSetMultiLock) Acquire(nodeMetaData NodeMeta) (bool, string, err } newAnnotationBytes, err := json.Marshal(&newAnnotation) if err != nil { - return false, "", fmt.Errorf("error marshalling new annotation lock: %w", err) + return false, fmt.Errorf("error marshalling new annotation lock: %w", err) } ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes) @@ -322,37 +319,38 @@ func (dsl *DaemonSetMultiLock) Acquire(nodeMetaData NodeMeta) (bool, string, err time.Sleep(time.Second) continue } else { - return false, "", fmt.Errorf("error updating daemonset with multi lock: %w", err) + return false, fmt.Errorf("error updating daemonset with multi lock: %w", err) } } - return true, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil + return true, nil } } -// TestMultiple attempts to check the kured daemonset lock status for multi locks -func (dsl *DaemonSetMultiLock) Holding() (bool, LockAnnotationValue, error) { - var lockdata LockAnnotationValue - ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) - if err != nil { - return false, lockdata, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) - } - - valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] - if exists { - value := multiLockAnnotationValue{} - if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return false, lockdata, err - } - - for _, nodeLock := range value.LockAnnotations { - if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) { - return true, nodeLock, nil - } - } - } - - return false, lockdata, nil -} +// +//// TestMultiple attempts to check the kured daemonset lock status for multi locks +//func (dsl *DaemonSetMultiLock) Holding() (bool, LockAnnotationValue, error) { +// var lockdata LockAnnotationValue +// ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) +// if err != nil { +// return false, lockdata, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) +// } +// +// valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] +// if exists { +// value := multiLockAnnotationValue{} +// if err := json.Unmarshal([]byte(valueString), &value); err != nil { +// return false, lockdata, err +// } +// +// for _, nodeLock := range value.LockAnnotations { +// if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) { +// return true, nodeLock, nil +// } +// } +// } +// +// return false, lockdata, nil +//} // Release attempts to remove the lock data for a single node from the multi node annotation func (dsl *DaemonSetMultiLock) Release() error { @@ -382,9 +380,11 @@ func (dsl *DaemonSetMultiLock) Release() error { } } } - - if !exists || !modified { - return fmt.Errorf("Lock not held") + if !exists { + return nil + } + if !modified { + return fmt.Errorf("lock not held") } newAnnotationBytes, err := json.Marshal(value) diff --git a/internal/daemonsetlock/daemonsetlock_test.go b/internal/daemonsetlock/daemonsetlock_test.go index f68a81e28..4b931776e 100644 --- a/internal/daemonsetlock/daemonsetlock_test.go +++ b/internal/daemonsetlock/daemonsetlock_test.go @@ -183,10 +183,10 @@ func TestCanAcquireMultiple(t *testing.T) { 
lockPossible: true, }, } - nm := NodeMeta{Unschedulable: false} + for _, testCase := range testCases { t.Run(testCase.name, func(t *testing.T) { - lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, nm, time.Minute, testCase.maxOwners) + lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, time.Minute, testCase.maxOwners) if lockPossible != testCase.lockPossible { t.Fatalf( "unexpected result for lock possible (got %t expected %t new annotation %v", From 4687d88b2b1d730a2c1c50f22b1c9d209258354b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 11 Nov 2024 19:35:35 +0100 Subject: [PATCH 21/22] Extract operations into internal pkg This reduces the global variables, and regroups all the operations in a single place. This will allow further refactor to represent all the k8s operations kured needs on a single node. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 214 +------------------ internal/k8soperations/annotations.go | 86 ++++++++ internal/k8soperations/drain.go | 124 +++++++++++ internal/k8soperations/logging.go | 19 ++ internal/{taints => k8soperations}/taints.go | 6 +- 5 files changed, 241 insertions(+), 208 deletions(-) create mode 100644 internal/k8soperations/annotations.go create mode 100644 internal/k8soperations/drain.go create mode 100644 internal/k8soperations/logging.go rename internal/{taints => k8soperations}/taints.go (96%) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 4846c6f76..db3237f5d 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -2,11 +2,10 @@ package main import ( "context" - "encoding/json" "fmt" "github.com/kubereboot/kured/internal/daemonsetlock" + "github.com/kubereboot/kured/internal/k8soperations" "github.com/kubereboot/kured/internal/notifications" - "github.com/kubereboot/kured/internal/taints" "github.com/kubereboot/kured/internal/timewindow" "github.com/kubereboot/kured/pkg/blockers" "github.com/kubereboot/kured/pkg/checkers" @@ -17,10 +16,8 @@ import ( flag "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - kubectldrain "k8s.io/kubectl/pkg/drain" "log" "log/slog" "net/http" @@ -92,8 +89,7 @@ var ( const ( // KuredNodeLockAnnotation is the canonical string value for the kured node-lock annotation - KuredNodeLockAnnotation string = "kured.dev/kured-node-lock" - KuredNodeWasUnschedulableBeforeDrainAnnotation string = "kured.dev/node-unschedulable-before-drain" + KuredNodeLockAnnotation string = "kured.dev/kured-node-lock" // KuredRebootInProgressAnnotation is the canonical string value for the kured reboot-in-progress annotation KuredRebootInProgressAnnotation string = "kured.dev/kured-reboot-in-progress" // KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation @@ -339,202 +335,9 @@ func LoadFromEnv() { } -type slogWriter struct { - stream string - message string -} - -func (sw slogWriter) Write(p []byte) (n int, err error) { - output := string(p) - switch sw.stream { - case "stdout": - slog.Info(sw.message, "node", nodeID, "stdout", output) - case "stderr": - slog.Info(sw.message, "node", nodeID, "stderr", output) - } - return len(p), nil -} - -func drain(client *kubernetes.Clientset, node *v1.Node, notifier notifications.Notifier) error { - nodename := node.GetName() - - if preRebootNodeLabels != nil { - err := updateNodeLabels(client, node, 
preRebootNodeLabels) - if err != nil { - return fmt.Errorf("stopping drain due to problem with node labels %v", err) - } - } - - if drainDelay > 0 { - slog.Debug("Delaying drain", "delay", drainDelay, "node", nodename) - time.Sleep(drainDelay) - } - - slog.Info("Starting drain", "node", nodename) - - notifier.Send(fmt.Sprintf(messageTemplateDrain, nodename), "Starting drain") - - kubectlStdOutLogger := &slogWriter{message: "draining: results", stream: "stdout"} - kubectlStdErrLogger := &slogWriter{message: "draining: results", stream: "stderr"} - - drainer := &kubectldrain.Helper{ - Client: client, - Ctx: context.Background(), - GracePeriodSeconds: drainGracePeriod, - PodSelector: drainPodSelector, - SkipWaitForDeleteTimeoutSeconds: skipWaitForDeleteTimeoutSeconds, - Force: true, - DeleteEmptyDirData: true, - IgnoreAllDaemonSets: true, - ErrOut: kubectlStdErrLogger, - Out: kubectlStdOutLogger, - Timeout: drainTimeout, - } - - // Add previous state of the node Spec.Unschedulable into an annotation - // If an annotation was present, it means that either the cordon or drain failed, - // hence it does not need to reapply: It might override what the user has set - // (for example if the cordon succeeded but the drain failed) - if _, ok := node.Annotations[KuredNodeWasUnschedulableBeforeDrainAnnotation]; !ok { - // Store State of the node before cordon changes it - annotations := map[string]string{KuredNodeWasUnschedulableBeforeDrainAnnotation: strconv.FormatBool(node.Spec.Unschedulable)} - // & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot - err := addNodeAnnotations(client, nodeID, annotations) - if err != nil { - return fmt.Errorf("error saving state of the node %s, %v", nodename, err) - } - } - - if err := kubectldrain.RunCordonOrUncordon(drainer, node, true); err != nil { - return fmt.Errorf("error cordonning node %s, %v", nodename, err) - } - - if err := kubectldrain.RunNodeDrain(drainer, nodename); err != nil { - return fmt.Errorf("error draining node %s: %v", nodename, err) - } - return nil -} - -func uncordon(client *kubernetes.Clientset, node *v1.Node, notifier notifications.Notifier) error { - // Revert cordon spec change with the help of node annotation - annotationContent, ok := node.Annotations[KuredNodeWasUnschedulableBeforeDrainAnnotation] - if !ok { - // If no node annotations, uncordon will not act. - // Do not uncordon if you do not know previous state, it could bring nodes under maintenance online! 
- return nil - } - - wasUnschedulable, err := strconv.ParseBool(annotationContent) - if err != nil { - return fmt.Errorf("annotation was edited and cannot be converted back to bool %v, cannot uncordon (unrecoverable)", err) - } - - if wasUnschedulable { - // Just delete the annotation, keep Cordonned - err := deleteNodeAnnotation(client, nodeID, KuredNodeWasUnschedulableBeforeDrainAnnotation) - if err != nil { - return fmt.Errorf("error removing the WasUnschedulable annotation, keeping the node stuck in cordonned state forever %v", err) - } - return nil - } - - nodeName := node.GetName() - kubectlStdOutLogger := &slogWriter{message: "uncordon: results", stream: "stdout"} - kubectlStdErrLogger := &slogWriter{message: "uncordon: results", stream: "stderr"} - - drainer := &kubectldrain.Helper{ - Client: client, - ErrOut: kubectlStdErrLogger, - Out: kubectlStdOutLogger, - Ctx: context.Background(), - } - if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil { - return fmt.Errorf("error uncordonning node %s: %v", nodeName, err) - } else if postRebootNodeLabels != nil { - err := updateNodeLabels(client, node, postRebootNodeLabels) - return fmt.Errorf("error updating node (%s) labels, needs manual intervention %v", nodeName, err) - } - - err = deleteNodeAnnotation(client, nodeID, KuredNodeWasUnschedulableBeforeDrainAnnotation) - if err != nil { - return fmt.Errorf("error removing the WasUnschedulable annotation, keeping the node stuck in current state forever %v", err) - } - notifier.Send(fmt.Sprintf(messageTemplateUncordon, nodeID), "Node uncordonned successfully") - return nil -} - -func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error { - node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("error retrieving node object via k8s API: %v", err) - } - for k, v := range annotations { - node.Annotations[k] = v - slog.Debug(fmt.Sprintf("adding node annotation: %s=%s", k, v), "node", node.GetName()) - } - - bytes, err := json.Marshal(node) - if err != nil { - return fmt.Errorf("error marshalling node object into JSON: %v", err) - } - - _, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{}) - if err != nil { - var annotationsErr string - for k, v := range annotations { - annotationsErr += fmt.Sprintf("%s=%s ", k, v) - } - return fmt.Errorf("error adding node annotations %s via k8s API: %v", annotationsErr, err) - } - return nil -} - -func deleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) error { - // JSON Patch takes as path input a JSON Pointer, defined in RFC6901 - // So we replace all instances of "/" with "~1" as per: - // https://tools.ietf.org/html/rfc6901#section-3 - patch := []byte(fmt.Sprintf("[{\"op\":\"remove\",\"path\":\"/metadata/annotations/%s\"}]", strings.ReplaceAll(key, "/", "~1"))) - _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{}) - if err != nil { - return fmt.Errorf("error deleting node annotation %s via k8s API: %v", key, err) - } - return nil -} - -func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []string) error { - labelsMap := make(map[string]string) - for _, label := range labels { - k := strings.Split(label, "=")[0] - v := strings.Split(label, "=")[1] - labelsMap[k] = v - slog.Debug(fmt.Sprintf("Updating node %s label: %s=%s", node.GetName(), k, 
v), "node", node.GetName()) - } - - bytes, err := json.Marshal(map[string]interface{}{ - "metadata": map[string]interface{}{ - "labels": labelsMap, - }, - }) - if err != nil { - return fmt.Errorf("error marshalling node object into JSON: %v", err) - } - - _, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{}) - if err != nil { - var labelsErr string - for _, label := range labels { - k := strings.Split(label, "=")[0] - v := strings.Split(label, "=")[1] - labelsErr += fmt.Sprintf("%s=%s ", k, v) - } - return fmt.Errorf("error updating node labels %s via k8s API: %v", labelsErr, err) - } - return nil -} - func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset, period time.Duration, notifier notifications.Notifier) { - preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) + preferNoScheduleTaint := k8soperations.NewTaint(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) // No reason to delay the first ticks. // On top of it, we used to leak a goroutine, which was never garbage collected. @@ -559,7 +362,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. continue } - err = uncordon(client, node, notifier) + err = k8soperations.Uncordon(client, node, notifier, postRebootNodeLabels, messageTemplateUncordon) if err != nil { // Might be a transient API issue or a real problem. Inform the admin slog.Info("unable to uncordon needs investigation", "node", nodeID, "error", err) @@ -579,7 +382,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok { // Who reads this? I hope nobody bothers outside real debug cases slog.Debug(fmt.Sprintf("Deleting node %s annotation %s", nodeID, KuredRebootInProgressAnnotation), "node", nodeID) - err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) + err := k8soperations.DeleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) if err != nil { continue } @@ -627,7 +430,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. annotations := map[string]string{KuredRebootInProgressAnnotation: timeNowString} // & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot annotations[KuredMostRecentRebootNeededAnnotation] = timeNowString - err := addNodeAnnotations(client, nodeID, annotations) + err := k8soperations.AddNodeAnnotations(client, nodeID, annotations) if err != nil { continue } @@ -664,7 +467,8 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // } //} - err = drain(client, node, notifier) + err = k8soperations.Drain(client, node, preRebootNodeLabels, drainTimeout, drainGracePeriod, skipWaitForDeleteTimeoutSeconds, drainPodSelector, drainDelay, messageTemplateDrain, notifier) + if err != nil { if !forceReboot { slog.Debug(fmt.Sprintf("Unable to cordon or drain %s: %v, will force-reboot by releasing lock and uncordon until next success", node.GetName(), err), "node", nodeID, "error", err) @@ -677,7 +481,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // If shown, it is helping understand the uncordonning. 
If the admin seems the node as cordonned // with this, it needs to take action (for example if the node was previously cordonned!) slog.Info("Performing a best-effort uncordon after failed cordon and drain", "node", nodeID) - err := uncordon(client, node, notifier) + err := k8soperations.Uncordon(client, node, notifier, postRebootNodeLabels, messageTemplateUncordon) if err != nil { slog.Info("Uncordon failed", "error", err) } diff --git a/internal/k8soperations/annotations.go b/internal/k8soperations/annotations.go new file mode 100644 index 000000000..492a901ff --- /dev/null +++ b/internal/k8soperations/annotations.go @@ -0,0 +1,86 @@ +package k8soperations + +import ( + "context" + "encoding/json" + "fmt" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "log/slog" + "strings" +) + +const ( + KuredNodeWasUnschedulableBeforeDrainAnnotation string = "kured.dev/node-unschedulable-before-drain" +) + +func AddNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error { + node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("error retrieving node object via k8s API: %v", err) + } + for k, v := range annotations { + node.Annotations[k] = v + slog.Debug(fmt.Sprintf("adding node annotation: %s=%s", k, v), "node", node.GetName()) + } + + bytes, err := json.Marshal(node) + if err != nil { + return fmt.Errorf("error marshalling node object into JSON: %v", err) + } + + _, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{}) + if err != nil { + var annotationsErr string + for k, v := range annotations { + annotationsErr += fmt.Sprintf("%s=%s ", k, v) + } + return fmt.Errorf("error adding node annotations %s via k8s API: %v", annotationsErr, err) + } + return nil +} + +func DeleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) error { + // JSON Patch takes as path input a JSON Pointer, defined in RFC6901 + // So we replace all instances of "/" with "~1" as per: + // https://tools.ietf.org/html/rfc6901#section-3 + patch := []byte(fmt.Sprintf("[{\"op\":\"remove\",\"path\":\"/metadata/annotations/%s\"}]", strings.ReplaceAll(key, "/", "~1"))) + _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("error deleting node annotation %s via k8s API: %v", key, err) + } + return nil +} + +func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []string) error { + labelsMap := make(map[string]string) + for _, label := range labels { + k := strings.Split(label, "=")[0] + v := strings.Split(label, "=")[1] + labelsMap[k] = v + slog.Debug(fmt.Sprintf("Updating node %s label: %s=%s", node.GetName(), k, v), "node", node.GetName()) + } + + bytes, err := json.Marshal(map[string]interface{}{ + "metadata": map[string]interface{}{ + "labels": labelsMap, + }, + }) + if err != nil { + return fmt.Errorf("error marshalling node object into JSON: %v", err) + } + + _, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{}) + if err != nil { + var labelsErr string + for _, label := range labels { + k := strings.Split(label, "=")[0] + v := strings.Split(label, "=")[1] + labelsErr += fmt.Sprintf("%s=%s ", k, v) + } + return fmt.Errorf("error updating node 
labels %s via k8s API: %v", labelsErr, err) + } + return nil +} diff --git a/internal/k8soperations/drain.go b/internal/k8soperations/drain.go new file mode 100644 index 000000000..506629f4a --- /dev/null +++ b/internal/k8soperations/drain.go @@ -0,0 +1,124 @@ +package k8soperations + +import ( + "context" + "fmt" + "github.com/kubereboot/kured/internal/notifications" + "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + kubeDrain "k8s.io/kubectl/pkg/drain" + "log/slog" + "strconv" + "time" +) + +// Drain drains the node in a kured fashion, respecting delays, notifications, and applying labels/annotations. +func Drain(client *kubernetes.Clientset, node *v1.Node, preRebootNodeLabels []string, drainTimeout time.Duration, drainGracePeriod int, skipWaitForDeleteTimeoutSeconds int, drainPodSelector string, drainDelay time.Duration, messageTemplateDrain string, notifier notifications.Notifier) error { + nodeName := node.GetName() + + if preRebootNodeLabels != nil { + err := updateNodeLabels(client, node, preRebootNodeLabels) + if err != nil { + return fmt.Errorf("stopping drain due to problem with node labels %v", err) + } + } + + if drainDelay > 0 { + slog.Debug("Delaying drain", "delay", drainDelay, "node", nodeName) + time.Sleep(drainDelay) + } + + slog.Info("Starting drain", "node", nodeName) + + notifier.Send(fmt.Sprintf(messageTemplateDrain, nodeName), "Starting drain") + + kubectlStdOutLogger := &slogWriter{message: "draining: results", stream: "stdout"} + kubectlStdErrLogger := &slogWriter{message: "draining: results", stream: "stderr"} + + drainer := &kubeDrain.Helper{ + Client: client, + Ctx: context.Background(), + GracePeriodSeconds: drainGracePeriod, + PodSelector: drainPodSelector, + SkipWaitForDeleteTimeoutSeconds: skipWaitForDeleteTimeoutSeconds, + Force: true, + DeleteEmptyDirData: true, + IgnoreAllDaemonSets: true, + ErrOut: kubectlStdErrLogger, + Out: kubectlStdOutLogger, + Timeout: drainTimeout, + } + + // Add previous state of the node Spec.Unschedulable into an annotation + // If an annotation was present, it means that either the cordon or drain failed, + // hence it does not need to reapply: It might override what the user has set + // (for example if the cordon succeeded but the drain failed) + if _, ok := node.Annotations[KuredNodeWasUnschedulableBeforeDrainAnnotation]; !ok { + // Store State of the node before cordon changes it + annotations := map[string]string{KuredNodeWasUnschedulableBeforeDrainAnnotation: strconv.FormatBool(node.Spec.Unschedulable)} + // & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot + err := AddNodeAnnotations(client, nodeName, annotations) + if err != nil { + return fmt.Errorf("error saving state of the node %s, %v", nodeName, err) + } + } + + if err := kubeDrain.RunCordonOrUncordon(drainer, node, true); err != nil { + return fmt.Errorf("error cordonning node %s, %v", nodeName, err) + } + + if err := kubeDrain.RunNodeDrain(drainer, nodeName); err != nil { + return fmt.Errorf("error draining node %s: %v", nodeName, err) + } + return nil +} + +// Uncordon changes the `spec.Unschedulable` field of a node, applying kured labels and annotations. 
+// Is a noop on missing annotation or on a node in maintenance before kured action +func Uncordon(client *kubernetes.Clientset, node *v1.Node, notifier notifications.Notifier, postRebootNodeLabels []string, messageTemplateUncordon string) error { + nodeName := node.GetName() + // Revert cordon spec change with the help of node annotation + annotationContent, ok := node.Annotations[KuredNodeWasUnschedulableBeforeDrainAnnotation] + if !ok { + // If no node annotations, uncordon will not act. + // Do not uncordon if you do not know previous state, it could bring nodes under maintenance online! + return nil + } + + wasUnschedulable, err := strconv.ParseBool(annotationContent) + if err != nil { + return fmt.Errorf("annotation was edited and cannot be converted back to bool %v, cannot uncordon (unrecoverable)", err) + } + + if wasUnschedulable { + // Just delete the annotation, keep Cordonned + err := DeleteNodeAnnotation(client, nodeName, KuredNodeWasUnschedulableBeforeDrainAnnotation) + if err != nil { + return fmt.Errorf("error removing the WasUnschedulable annotation, keeping the node stuck in cordonned state forever %v", err) + } + return nil + } + + kubectlStdOutLogger := &slogWriter{message: "uncordon: results", stream: "stdout"} + kubectlStdErrLogger := &slogWriter{message: "uncordon: results", stream: "stderr"} + + drainer := &kubeDrain.Helper{ + Client: client, + ErrOut: kubectlStdErrLogger, + Out: kubectlStdOutLogger, + Ctx: context.Background(), + } + if err := kubeDrain.RunCordonOrUncordon(drainer, node, false); err != nil { + return fmt.Errorf("error uncordonning node %s: %v", nodeName, err) + } else if postRebootNodeLabels != nil { + err := updateNodeLabels(client, node, postRebootNodeLabels) + return fmt.Errorf("error updating node (%s) labels, needs manual intervention %v", nodeName, err) + } + + err = DeleteNodeAnnotation(client, nodeName, KuredNodeWasUnschedulableBeforeDrainAnnotation) + if err != nil { + return fmt.Errorf("error removing the WasUnschedulable annotation, keeping the node stuck in current state forever %v", err) + } + notifier.Send(fmt.Sprintf(messageTemplateUncordon, nodeName), "Node uncordonned successfully") + return nil +} diff --git a/internal/k8soperations/logging.go b/internal/k8soperations/logging.go new file mode 100644 index 000000000..15cf4fb9f --- /dev/null +++ b/internal/k8soperations/logging.go @@ -0,0 +1,19 @@ +package k8soperations + +import "log/slog" + +type slogWriter struct { + stream string + message string +} + +func (sw slogWriter) Write(p []byte) (n int, err error) { + output := string(p) + switch sw.stream { + case "stdout": + slog.Info(sw.message, "stdout", output) + case "stderr": + slog.Info(sw.message, "stderr", output) + } + return len(p), nil +} diff --git a/internal/taints/taints.go b/internal/k8soperations/taints.go similarity index 96% rename from internal/taints/taints.go rename to internal/k8soperations/taints.go index 1d92cc56f..2bed538f7 100644 --- a/internal/taints/taints.go +++ b/internal/k8soperations/taints.go @@ -1,4 +1,4 @@ -package taints +package k8soperations import ( "context" @@ -21,8 +21,8 @@ type Taint struct { exists bool } -// New provides a new taint. -func New(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect) *Taint { +// NewTaint provides a new taint. 
+func NewTaint(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect) *Taint {
 	exists, _, _ := taintExists(client, nodeID, taintName)
 
 	return &Taint{

From 0df8f7c8830387ab12ecf37912f9b8fe556d6577 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Evrard
Date: Mon, 11 Nov 2024 15:48:26 +0100
Subject: [PATCH 22/22] Improve dev environment

Without this patch, while developing (on uncommitted work),
the image with the previous SHA gets overridden, which leads
to mishaps.

This fixes it by ensuring only the kured:dev tags (full path and
short one) are used everywhere.

At the same time, it cleans up main_test.go to be more flexible
by passing more of the main features as options.

Signed-off-by: Jean-Philippe Evrard
---
 Makefile                |  7 ++--
 tests/kind/main_test.go | 77 +++++++++++++++++++++++++++++++++++------
 2 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index c81b1d31a..dd05d9aff 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@ bootstrap-tools: $(HACKDIR)
 clean:
 	rm -rf ./dist
 
-kured: bootstrap-tools
+kured: clean bootstrap-tools
 	$(GORELEASER_CMD) build --clean --single-target --snapshot
 
 kured-all: bootstrap-tools
@@ -39,8 +39,9 @@ kured-release-snapshot: bootstrap-tools
 image: kured
 	$(SUDO) docker buildx build --no-cache --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) .
 
-dev-image: image
-	$(SUDO) docker tag ghcr.io/$(DH_ORG)/kured:$(VERSION) kured:dev
+dev-image: kured
+	$(SUDO) docker buildx build --no-cache --load -t ghcr.io/$(DH_ORG)/kured:dev .
+	$(SUDO) docker tag ghcr.io/$(DH_ORG)/kured:dev kured:dev
 
 dev-manifest:
 	# basic e2e scenario
diff --git a/tests/kind/main_test.go b/tests/kind/main_test.go
index fec78a917..8df4c02e7 100644
--- a/tests/kind/main_test.go
+++ b/tests/kind/main_test.go
@@ -84,14 +84,27 @@ func LocalImage(nameTag string) Option {
 	}
 }
 
+// WithClusterName can be passed to NewKindTester to override the default cluster name.
+func WithClusterName(name string) Option {
+	return func(k *KindTest) {
+		k.clusterName = name
+	}
+}
+
+func ForTestInstance(t *testing.T) Option {
+	return func(k *KindTest) {
+		k.testInstance = t
+	}
+}
+
 // NewKind creates a kind cluster given a name and set of Option instances.
-func NewKindTester(kindClusterName string, filePath string, t *testing.T, options ...Option) *KindTest { +func NewKindTester(config string, options ...Option) *KindTest { k := &KindTest{ - clusterName: kindClusterName, + clusterName: "kured", timeout: 10 * time.Minute, - kindConfigPath: filePath, - testInstance: t, + kindConfigPath: config, + testInstance: nil, } for _, option := range options { option(k) @@ -157,7 +170,14 @@ func TestE2EWithCommand(t *testing.T) { kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds.yaml")) + k := NewKindTester( + kindClusterConfigFile, + ForTestInstance(t), + WithClusterName(kindClusterName), + LocalImage(kuredDevImage), + Deploy("../../kured-rbac.yaml"), + Deploy("testfiles/kured-ds.yaml"), + ) defer k.FlushLog() err := k.Create() @@ -207,7 +227,14 @@ func TestE2EWithSignal(t *testing.T) { kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-signal.yaml")) + k := NewKindTester( + kindClusterConfigFile, + ForTestInstance(t), + WithClusterName(kindClusterName), + LocalImage(kuredDevImage), + Deploy("../../kured-rbac.yaml"), + Deploy("testfiles/kured-ds-signal.yaml"), + ) defer k.FlushLog() err := k.Create() @@ -257,7 +284,14 @@ func TestE2EConcurrentWithCommand(t *testing.T) { kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-command.yaml")) + k := NewKindTester( + kindClusterConfigFile, + ForTestInstance(t), + WithClusterName(kindClusterName), + LocalImage(kuredDevImage), + Deploy("../../kured-rbac.yaml"), + Deploy("testfiles/kured-ds-concurrent-command.yaml"), + ) defer k.FlushLog() err := k.Create() @@ -307,7 +341,14 @@ func TestE2EConcurrentWithSignal(t *testing.T) { kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-signal.yaml")) + k := NewKindTester( + kindClusterConfigFile, + ForTestInstance(t), + WithClusterName(kindClusterName), + LocalImage(kuredDevImage), + Deploy("../../kured-rbac.yaml"), + Deploy("testfiles/kured-ds-concurrent-signal.yaml"), + ) defer k.FlushLog() err := k.Create() @@ -362,7 +403,16 @@ func TestCordonningIsKept(t *testing.T) { } else { manifest = "testfiles/kured-ds-concurrent-signal.yaml" } - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(manifest)) + + k := NewKindTester( + kindClusterConfigFile, + ForTestInstance(t), + WithClusterName(kindClusterName), + LocalImage(kuredDevImage), + Deploy("../../kured-rbac.yaml"), + Deploy(manifest), + ) + defer k.FlushLog() err := k.Create() @@ -405,7 +455,14 @@ func TestE2EBlocker(t *testing.T) { kindClusterConfigFile := 
"../../.github/kind-cluster-next.yaml" kindContext := fmt.Sprintf("kind-%v", kindClusterName) - k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", variant))) + k := NewKindTester( + kindClusterConfigFile, + ForTestInstance(t), + WithClusterName(kindClusterName), + LocalImage(kuredDevImage), + Deploy("../../kured-rbac.yaml"), + Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", variant)), + ) defer k.FlushLog() err := k.Create()