Do not stop Kubernetes services on node removal if annotation is set. #681

Merged · 7 commits · Sep 20, 2024
1 change: 1 addition & 0 deletions docs/src/snap/reference/annotations.md
@@ -7,6 +7,7 @@ the bootstrap configuration.
| Name | Description | Values |
|---------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|
| `k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove` | If set, only microcluster and file cleanup are performed. This is helpful when an external controller (e.g., CAPI) manages the Kubernetes node lifecycle. By default, k8sd will remove the Kubernetes node when it is removed from the cluster. | "true"\|"false" |
| `k8sd/v1alpha/lifecycle/skip-stop-services-on-remove`          | If set, the Kubernetes services will not be stopped on the leaving node when it is removed from the cluster. This is helpful when an external controller (e.g., CAPI) manages the Kubernetes node lifecycle. By default, all services are stopped on leaving nodes. | "true"\|"false" |

<!-- Links -->

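For reference, a bootstrap configuration that opts into both lifecycle annotations would look roughly like the following (a minimal sketch; only the relevant `cluster-config` keys are shown, mirroring the annotation keys documented above and the testdata files in this PR):

```yaml
# Hypothetical bootstrap configuration excerpt; other cluster-config fields omitted.
cluster-config:
  annotations:
    # Keep the Kubernetes node object when the node leaves the cluster.
    k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove: true
    # Leave the k8s snap services running on the node being removed.
    k8sd/v1alpha/lifecycle/skip-stop-services-on-remove: true
```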
5 changes: 4 additions & 1 deletion src/k8s/cmd/k8s/k8s_bootstrap_test.go
@@ -62,7 +62,10 @@ var testCases = []testCase{
Enabled: utils.Pointer(true),
},
CloudProvider: utils.Pointer("external"),
Annotations: map[string]string{apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove: "true"},
Annotations: map[string]string{
apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove: "true",
apiv1.AnnotationSkipStopServicesOnRemove: "true",
},
},
ControlPlaneTaints: []string{"node-role.kubernetes.io/control-plane:NoSchedule"},
PodCIDR: utils.Pointer("10.100.0.0/16"),
1 change: 1 addition & 0 deletions src/k8s/cmd/k8s/testdata/bootstrap-config-full.yaml
@@ -23,6 +23,7 @@ cluster-config:
cloud-provider: external
annotations:
k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove: true
k8sd/v1alpha/lifecycle/skip-stop-services-on-remove: true
control-plane-taints:
- node-role.kubernetes.io/control-plane:NoSchedule
pod-cidr: 10.100.0.0/16
2 changes: 1 addition & 1 deletion src/k8s/go.mod
@@ -5,7 +5,7 @@ go 1.22.6
require (
dario.cat/mergo v1.0.0
github.com/canonical/go-dqlite v1.22.0
github.com/canonical/k8s-snap-api v1.0.5
github.com/canonical/k8s-snap-api v1.0.6
github.com/canonical/lxd v0.0.0-20240822122218-e7b2a7a83230
github.com/canonical/microcluster/v3 v3.0.0-20240827143335-f7a4d3984970
github.com/go-logr/logr v1.4.2
4 changes: 2 additions & 2 deletions src/k8s/go.sum
@@ -99,8 +99,8 @@ github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0 h1:nvj0OLI3YqYXe
github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/canonical/go-dqlite v1.22.0 h1:DuJmfcREl4gkQJyvZzjl2GHFZROhbPyfdjDRQXpkOyw=
github.com/canonical/go-dqlite v1.22.0/go.mod h1:Uvy943N8R4CFUAs59A1NVaziWY9nJ686lScY7ywurfg=
github.com/canonical/k8s-snap-api v1.0.5 h1:49bgi6CGtFjCPweeTz55Sv/waKgCl6ftx4BqXt3RI9k=
github.com/canonical/k8s-snap-api v1.0.5/go.mod h1:LDPoIYCeYnfgOFrwVPJ/4edGU264w7BB7g0GsVi36AY=
github.com/canonical/k8s-snap-api v1.0.6 h1:hUJ59ol9romwUz82bYIumitobcuBQwKjWMnge1AhGzM=
github.com/canonical/k8s-snap-api v1.0.6/go.mod h1:LDPoIYCeYnfgOFrwVPJ/4edGU264w7BB7g0GsVi36AY=
github.com/canonical/lxd v0.0.0-20240822122218-e7b2a7a83230 h1:YOqZ+/14OPZ+/TOXpRHIX3KLT0C+wZVpewKIwlGUmW0=
github.com/canonical/lxd v0.0.0-20240822122218-e7b2a7a83230/go.mod h1:YVGI7HStOKsV+cMyXWnJ7RaMPaeWtrkxyIPvGWbgACc=
github.com/canonical/microcluster/v3 v3.0.0-20240827143335-f7a4d3984970 h1:UrnpglbXELlxtufdk6DGDytu2JzyzuS3WTsOwPrkQLI=
1 change: 1 addition & 0 deletions src/k8s/pkg/k8sd/api/worker.go
@@ -86,5 +86,6 @@ func (e *Endpoints) postWorkerInfo(s state.State, r *http.Request) response.Resp
KubeProxyClientCert: workerCertificates.KubeProxyClientCert,
KubeProxyClientKey: workerCertificates.KubeProxyClientKey,
K8sdPublicKey: cfg.Certificates.GetK8sdPublicKey(),
Annotations: cfg.Annotations,
})
}
2 changes: 1 addition & 1 deletion src/k8s/pkg/k8sd/app/hooks_bootstrap.go
@@ -28,7 +28,6 @@ import (
// onBootstrap is called after we bootstrap the first cluster node.
// onBootstrap configures local services then writes the cluster config on the database.
func (a *App) onBootstrap(ctx context.Context, s state.State, initConfig map[string]string) error {

// NOTE(neoaggelos): context timeout is passed over configuration, so that hook failures are propagated to the client
ctx, cancel := context.WithCancel(ctx)
defer cancel()
@@ -213,6 +212,7 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
CACert: utils.Pointer(response.CACert),
ClientCACert: utils.Pointer(response.ClientCACert),
},
Annotations: response.Annotations,
}

// Pre-init checks
23 changes: 13 additions & 10 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
@@ -59,8 +59,9 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
log.Error(err, "Failed to wait for node to finish microcluster join before removing. Continuing with the cleanup...")
}

if cfg, err := databaseutil.GetClusterConfig(ctx, s); err == nil {
if _, ok := cfg.Annotations[apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove]; !ok {
cfg, err := databaseutil.GetClusterConfig(ctx, s)
if err == nil {
if _, ok := cfg.Annotations.Get(apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove); !ok {
c, err := snap.KubernetesClient("")
if err != nil {
log.Error(err, "Failed to create Kubernetes client", err)
@@ -124,19 +125,21 @@
log.Error(err, "Failed to unmark node as worker")
}

log.Info("Stopping worker services")
if err := snaputil.StopWorkerServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop worker services")
}

log.Info("Cleaning up control plane certificates")
if _, err := setup.EnsureControlPlanePKI(snap, &pki.ControlPlanePKI{}); err != nil {
log.Error(err, "failed to cleanup control plane certificates")
}

log.Info("Stopping control plane services")
if err := snaputil.StopControlPlaneServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop control-plane services")
if _, ok := cfg.Annotations.Get(apiv1.AnnotationSkipStopServicesOnRemove); !ok {
log.Info("Stopping worker services")
if err := snaputil.StopWorkerServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop worker services")
}

log.Info("Stopping control plane services")
if err := snaputil.StopControlPlaneServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop control-plane services")
}
}

return nil
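The gating added above reduces to two independent checks on the cluster config annotations. A condensed, self-contained sketch of that control flow (using local stand-ins for the `apiv1` annotation constants and the annotations type, which in the real code come from `k8s-snap-api`; only key presence is inspected, matching the hook above):

```go
package main

import "fmt"

// Annotation keys as documented in docs/src/snap/reference/annotations.md.
const (
	annotationSkipCleanupKubernetesNodeOnRemove = "k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove"
	annotationSkipStopServicesOnRemove          = "k8sd/v1alpha/lifecycle/skip-stop-services-on-remove"
)

// annotations is a local stand-in for the cluster config annotations map;
// the real type exposes a similar Get returning (value, ok).
type annotations map[string]string

func (a annotations) Get(key string) (string, bool) {
	v, ok := a[key]
	return v, ok
}

// preRemove mirrors the shape of onPreRemove above: each cleanup step runs
// only when its annotation key is absent (the value is not inspected here).
func preRemove(a annotations) {
	if _, ok := a.Get(annotationSkipCleanupKubernetesNodeOnRemove); !ok {
		fmt.Println("removing the Kubernetes node object")
	}
	if _, ok := a.Get(annotationSkipStopServicesOnRemove); !ok {
		fmt.Println("stopping worker and control plane services")
	}
}

func main() {
	// With only skip-stop-services set, the node object is still removed,
	// but the services on the leaving node are left running.
	preRemove(annotations{annotationSkipStopServicesOnRemove: "true"})
}
```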
9 changes: 9 additions & 0 deletions bootstrap-skip-service-stop.yaml (new test manifest)
@@ -0,0 +1,9 @@
cluster-config:
network:
enabled: true
dns:
enabled: true
metrics-server:
enabled: true
annotations:
k8sd/v1alpha/lifecycle/skip-stop-services-on-remove: true
54 changes: 54 additions & 0 deletions tests/integration/tests/test_clustering.py
@@ -96,6 +96,60 @@ def test_no_remove(instances: List[harness.Instance]):
assert len(nodes) == 3, "worker node should not have been removed from cluster"


@pytest.mark.node_count(3)
@pytest.mark.bootstrap_config(
(config.MANIFESTS_DIR / "bootstrap-skip-service-stop.yaml").read_text()
)
def test_skip_services_stop_on_remove(instances: List[harness.Instance]):
cluster_node = instances[0]
joining_cp = instances[1]
worker = instances[2]

join_token = util.get_join_token(cluster_node, joining_cp)
util.join_cluster(joining_cp, join_token)

join_token_worker = util.get_join_token(cluster_node, worker, "--worker")
util.join_cluster(worker, join_token_worker)

# We don't care if the node is ready or the CNI is up.
util.stubbornly(retries=5, delay_s=3).until(util.get_nodes(cluster_node) == 3)

cluster_node.exec(["k8s", "remove-node", joining_cp.id])
nodes = util.ready_nodes(cluster_node)
assert len(nodes) == 2, "cp node should have been removed from the cluster"
services = joining_cp.exec(
["snap", "services", "k8s"], capture_output=True, text=True
).stdout.split("\n")[1:-1]
print(services)
for service in services:
if "k8s-apiserver-proxy" in service:
assert (
" inactive " in service
), "apiserver proxy should be inactive on control-plane"
else:
assert " active " in service, "service should be active"

cluster_node.exec(["k8s", "remove-node", worker.id])
nodes = util.ready_nodes(cluster_node)
assert len(nodes) == 1, "worker node should have been removed from the cluster"
services = worker.exec(
["snap", "services", "k8s"], capture_output=True, text=True
).stdout.split("\n")[1:-1]
print(services)
for service in services:
for expected_active_service in [
"containerd",
"k8sd",
"kubelet",
"kube-proxy",
"k8s-apiserver-proxy",
]:
if expected_active_service in service:
assert (
" active " in service
), f"{expected_active_service} should be active on worker"


@pytest.mark.node_count(3)
def test_join_with_custom_token_name(instances: List[harness.Instance]):
cluster_node = instances[0]