From fb1db9ab7645619d849937dabe05e576d746f8a0 Mon Sep 17 00:00:00 2001
From: Gerrit
Date: Mon, 10 Jan 2022 14:24:22 +0100
Subject: [PATCH] Make metallb state become part of shoot health. (#228)

---
 .../templates/metallb.yaml            | 54 +++++++++++++
 pkg/controller/healthcheck/add.go     |  4 +
 pkg/controller/healthcheck/metallb.go | 81 +++++++++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 pkg/controller/healthcheck/metallb.go

diff --git a/charts/internal/shoot-control-plane/templates/metallb.yaml b/charts/internal/shoot-control-plane/templates/metallb.yaml
index 8442e4e6b..30dee03f9 100644
--- a/charts/internal/shoot-control-plane/templates/metallb.yaml
+++ b/charts/internal/shoot-control-plane/templates/metallb.yaml
@@ -421,6 +421,27 @@ spec:
             drop:
             - all
           readOnlyRootFilesystem: true
+
+        # we write the controller health state into a config map to make
+        # the state easily check-able through Gardener and make it part of
+        # the shoot health
+        #
+        # helps us to identify stale configs easily:
+        # https://github.com/metallb/metallb/issues/462
+      - name: health
+        image: "bitnami/kubectl:latest"
+        command:
+          - bash
+          - -c
+          - |
+            set -eo pipefail
+            while true; do
+              sleep 30
+              stale=$(curl -m 3 -s localhost:7472/metrics | grep '^metallb_k8s_client_config_stale_bool' | awk '{ print $2 }')
+              loaded=$(curl -m 3 -s localhost:7472/metrics | grep '^metallb_k8s_client_config_loaded_bool' | awk '{ print $2 }')
+              kubectl create --save-config configmap health --dry-run=client -o yaml --from-literal=configStale=${stale} --from-literal=configLoaded=${loaded} | kubectl apply -f -
+            done
+
       nodeSelector:
         kubernetes.io/os: linux
       securityContext:
@@ -429,3 +450,36 @@ spec:
         fsGroup: 65534
       serviceAccountName: controller
       terminationGracePeriodSeconds: 0
+
+---
+# belongs to the health check sidecar
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  labels:
+    app: metallb
+  name: health-monitoring
+  namespace: metallb-system
+rules:
+- apiGroups:
+  - ''
+  resources:
+  - configmaps
+  verbs:
+  - create
+  - patch
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  labels:
+    app: metallb
+  name: health-monitoring
+  namespace: metallb-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: health-monitoring
+subjects:
+- kind: ServiceAccount
+  name: controller
diff --git a/pkg/controller/healthcheck/add.go b/pkg/controller/healthcheck/add.go
index a9b6b585a..bcd218315 100644
--- a/pkg/controller/healthcheck/add.go
+++ b/pkg/controller/healthcheck/add.go
@@ -94,6 +94,10 @@ func RegisterHealthChecks(mgr manager.Manager, opts AddOptions) error {
 				HealthCheck:   CheckDuros(metal.DurosResourceName),
 				PreCheckFunc:  durosPreCheck,
 			},
+			{
+				ConditionType: string(gardencorev1beta1.ShootSystemComponentsHealthy),
+				HealthCheck:   CheckMetalLB(),
+			},
 		}); err != nil {
 		return err
 	}
diff --git a/pkg/controller/healthcheck/metallb.go b/pkg/controller/healthcheck/metallb.go
new file mode 100644
index 000000000..3bce1e3a0
--- /dev/null
+++ b/pkg/controller/healthcheck/metallb.go
@@ -0,0 +1,81 @@
+package healthcheck
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/gardener/gardener/extensions/pkg/controller/healthcheck"
+	gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
+
+	"github.com/go-logr/logr"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+// MetalLBHealthChecker contains all the information for the MetalLB HealthCheck
+type MetalLBHealthChecker struct {
+	logger      logr.Logger
+	shootClient client.Client
+}
+
+// CheckMetalLB is a healthCheck function to check MetalLBs
+func CheckMetalLB() healthcheck.HealthCheck {
+	return &MetalLBHealthChecker{}
+}
+
+// InjectShootClient injects the shoot client
+func (healthChecker *MetalLBHealthChecker) InjectShootClient(shootClient client.Client) {
+	healthChecker.shootClient = shootClient
+}
+
+// SetLoggerSuffix injects the logger
+func (healthChecker *MetalLBHealthChecker) SetLoggerSuffix(provider, extension string) {
+	healthChecker.logger = log.Log.WithName(fmt.Sprintf("%s-%s-healthcheck-metallb", provider, extension))
+}
+
+// DeepCopy clones the healthCheck struct by making a copy and returning the pointer to that new copy
+func (healthChecker *MetalLBHealthChecker) DeepCopy() healthcheck.HealthCheck {
+	clone := *healthChecker
+	return &clone
+}
+
+// Check reads the 'health' ConfigMap from the 'metallb-system' namespace via the shoot client and derives the result from it
+func (healthChecker *MetalLBHealthChecker) Check(ctx context.Context, request types.NamespacedName) (*healthcheck.SingleCheckResult, error) {
+	health := &v1.ConfigMap{}
+
+	if err := healthChecker.shootClient.Get(ctx, client.ObjectKey{Namespace: "metallb-system", Name: "health"}, health); err != nil {
+		err = fmt.Errorf("check metallb health configmap failed. Unable to retrieve 'health' in namespace 'metallb-system': %w", err)
+		healthChecker.logger.Error(err, "Health check failed")
+		return nil, err
+	}
+	if isHealthy, err := IsHealthy(health); !isHealthy {
+		healthChecker.logger.Error(err, "Health check failed")
+		return &healthcheck.SingleCheckResult{
+			Status: gardencorev1beta1.ConditionFalse,
+			Detail: err.Error(),
+		}, nil
+	}
+
+	return &healthcheck.SingleCheckResult{
+		Status: gardencorev1beta1.ConditionTrue,
+	}, nil
+}
+
+// IsHealthy evaluates the data of the given health ConfigMap. It reports true
+// only when "configLoaded" equals "1" and "configStale" does not equal "1";
+// otherwise it returns false together with an error naming the failed condition.
+func IsHealthy(health *v1.ConfigMap) (bool, error) {
+	isLoaded := health.Data["configLoaded"]
+	if isLoaded != "1" {
+		return false, fmt.Errorf("metallb configmap is not loaded")
+	}
+
+	isStale := health.Data["configStale"]
+	if isStale == "1" {
+		return false, fmt.Errorf("metallb configmap is stale / erroneous, next speaker reload may interrupt workload traffic")
+	}
+
+	return true, nil
+}