From 5e09376605c6a45ffd255b7ca39de988629fe97b Mon Sep 17 00:00:00 2001 From: Fabian Ruff Date: Thu, 14 Jul 2022 17:16:51 +0200 Subject: [PATCH] Scrape kluster metrics This PR adds two features: * It allows us to centrally scrape the kluster metrics (apiserver, ccm, cm, scheduler) into a dedicated Prometheus with short retention * It exposes scheduler, cm, ccm metrics via an ingress so that users can start collecting cluster metrics themselves. The metrics of all components (except etcd) are protected by Kubernetes RBAC, so a service account token with RBAC permissions for `/metrics` is required. --- charts/kube-master/templates/_helpers.tpl | 8 +- charts/kube-master/templates/api.yaml | 4 +- .../templates/cloud-controller-manager.yaml | 4 + charts/kube-master/templates/ingress.yaml | 50 ++++++++++++ charts/kube-master/templates/podmonitor.yaml | 42 ++++++++++ .../templates/service-metrics.yaml | 24 +++++- charts/kube-master/values.yaml | 3 + pkg/apis/kubernikus/v1/secret.go | 3 +- pkg/controller/ground/bootstrap.go | 77 +++++++++++++++++++ pkg/controller/ground/bootstrap/util.go | 26 ------- pkg/migration/22_seed_kubernikus_sa.go | 23 ++++++ pkg/migration/register.go | 1 + 12 files changed, 232 insertions(+), 33 deletions(-) create mode 100644 charts/kube-master/templates/podmonitor.yaml create mode 100644 pkg/migration/22_seed_kubernikus_sa.go diff --git a/charts/kube-master/templates/_helpers.tpl b/charts/kube-master/templates/_helpers.tpl index 01cfe3b44a..5889a0cd10 100644 --- a/charts/kube-master/templates/_helpers.tpl +++ b/charts/kube-master/templates/_helpers.tpl @@ -119,10 +119,14 @@ We truncate at 63 chars because some Kubernetes name fields are limited to this {{- required "tag for fluentd missing" .Values.images.fluentd.tag }} {{- end -}} +{{- define "ingress.base" -}} + {{- .Values.api.apiserverHost | replace (include "master.fullname" .) (printf "%s.ingress" (include "master.fullname" .) 
) -}} +{{- end -}} + {{- define "dashboard.url" -}} - {{- printf "dashboard-%s" ( .Values.api.apiserverHost | replace (include "master.fullname" .) (printf "%s.ingress" (include "master.fullname" .) ) ) -}} + dashboard-{{ include "ingress.base" . -}} {{- end -}} {{- define "dex.url" -}} - {{- printf "auth-%s" ( .Values.api.apiserverHost | replace (include "master.fullname" .) (printf "%s.ingress" (include "master.fullname" .) ) ) -}} + auth-{{ include "ingress.base" . -}} {{- end -}} diff --git a/charts/kube-master/templates/api.yaml b/charts/kube-master/templates/api.yaml index 0133bf72dc..d5c8f711b9 100644 --- a/charts/kube-master/templates/api.yaml +++ b/charts/kube-master/templates/api.yaml @@ -165,8 +165,8 @@ spec: containers: - name: apiserver ports: - - containerPort: 443 - name: server + - containerPort: {{ required "missing advertisePort" .Values.advertisePort }} + name: api protocol: TCP {{- if (semverCompare ">= 1.19" .Values.version.kubernetes) }} image: {{ include "apiserver.image" . | quote }} diff --git a/charts/kube-master/templates/cloud-controller-manager.yaml b/charts/kube-master/templates/cloud-controller-manager.yaml index d58a41bdef..d72c7a2b22 100644 --- a/charts/kube-master/templates/cloud-controller-manager.yaml +++ b/charts/kube-master/templates/cloud-controller-manager.yaml @@ -27,6 +27,7 @@ spec: template: metadata: labels: + component: cloud-controller-manager app: cloud-controller-manager kluster: {{ .Values.name }} account: {{ .Values.account }} @@ -99,6 +100,9 @@ spec: {{- end }} - --use-service-account-credentials=true - --concurrent-service-syncs=10 + ports: + - name: metrics + containerPort: 10258 livenessProbe: httpGet: path: /healthz diff --git a/charts/kube-master/templates/ingress.yaml b/charts/kube-master/templates/ingress.yaml index 28df44013c..4ecace7e64 100644 --- a/charts/kube-master/templates/ingress.yaml +++ b/charts/kube-master/templates/ingress.yaml @@ -50,4 +50,54 @@ spec: serviceName: {{ include "master.fullname" . 
}} servicePort: 6553 {{- end }} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "master.fullname" . }}-metrics + labels: + chart: "{{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}" + release: {{ .Release.Name }} + annotations: + nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" + ingress.kubernetes.io/backend-protocol: "HTTPS" + +spec: + rules: + - host: cm-{{include "ingress.base" . }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ include "master.fullname" . }}-cm + port: + number: 10257 + - host: ccm-{{include "ingress.base" . }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ include "master.fullname" . }}-ccm + port: + number: 10258 + - host: scheduler-{{include "ingress.base" . }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ include "master.fullname" . }}-sched + port: + number: 10259 + tls: + - hosts: + - cm-{{include "ingress.base" . }} + - ccm-{{include "ingress.base" . }} + - scheduler-{{include "ingress.base" . }} + secretName: {{ required "dex.ingressSecret undefined" .Values.dex.ingressSecret }} {{- end }} diff --git a/charts/kube-master/templates/podmonitor.yaml b/charts/kube-master/templates/podmonitor.yaml new file mode 100644 index 0000000000..e0831e0173 --- /dev/null +++ b/charts/kube-master/templates/podmonitor.yaml @@ -0,0 +1,42 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + labels: + prometheus: {{ .Values.metrics.prometheus }} + name: {{ include "master.fullname" . }} +spec: + jobLabel: {{ include "master.fullname" . }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + podMetricsEndpoints: + - interval: 60s + port: api + relabelings: + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + scheme: https + bearerTokenSecret: + name: {{ include "master.fullname" . 
}}-secret + key: serviceAccount + tlsConfig: + insecureSkipVerify: true + scrapeTimeout: 10s + - interval: 60s + port: metrics + relabelings: + - action: drop + regex: ^etcd$ + sourceLabels: [__meta_kubernetes_pod_label_component] + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + scheme: https + bearerTokenSecret: + name: {{ include "master.fullname" . }}-secret + key: serviceAccount + tlsConfig: + insecureSkipVerify: true + scrapeTimeout: 10s + selector: + matchLabels: + release: {{ include "master.fullname" . }} diff --git a/charts/kube-master/templates/service-metrics.yaml b/charts/kube-master/templates/service-metrics.yaml index 79a3610e3e..429164c879 100644 --- a/charts/kube-master/templates/service-metrics.yaml +++ b/charts/kube-master/templates/service-metrics.yaml @@ -5,8 +5,9 @@ metadata: labels: component: controller-manager-metrics release: {{ .Release.Name }} - name: {{ .Release.Name }}-cm-met + name: {{ .Release.Name }}-cm spec: + clusterIP: None ports: - name: metrics port: 10257 @@ -18,12 +19,31 @@ spec: --- apiVersion: v1 kind: Service +metadata: + labels: + component: cloud-controller-manager-metrics + release: {{ .Release.Name }} + name: {{ .Release.Name }}-ccm +spec: + clusterIP: None + ports: + - name: metrics + port: 10258 + protocol: TCP + targetPort: 10258 + selector: + component: cloud-controller-manager + release: {{ .Release.Name }} +--- +apiVersion: v1 +kind: Service metadata: labels: component: scheduler-metrics release: {{ .Release.Name }} - name: {{ .Release.Name }}-sched-met + name: {{ .Release.Name }}-sched spec: + clusterIP: None ports: - name: metrics port: 10259 diff --git a/charts/kube-master/values.yaml b/charts/kube-master/values.yaml index 6eb5d93595..582caef9a3 100644 --- a/charts/kube-master/values.yaml +++ b/charts/kube-master/values.yaml @@ -173,3 +173,6 @@ csi: memory: 100Mi audit: "" + +metrics: + prometheusName: kubernikus-collector diff --git a/pkg/apis/kubernikus/v1/secret.go 
b/pkg/apis/kubernikus/v1/secret.go index c2978a6cff..2136968ce0 100644 --- a/pkg/apis/kubernikus/v1/secret.go +++ b/pkg/apis/kubernikus/v1/secret.go @@ -18,7 +18,8 @@ type Secret struct { Certificates - ExtraValues string `json:"extra-values,omitempty"` + ExtraValues string `json:"extra-values,omitempty"` + ServiceAccount string `json:"serviceAccount,omitempty"` } func NewSecret(secret *corev1.Secret) (*Secret, error) { diff --git a/pkg/controller/ground/bootstrap.go b/pkg/controller/ground/bootstrap.go index f715bb3fb7..c4c7bda005 100644 --- a/pkg/controller/ground/bootstrap.go +++ b/pkg/controller/ground/bootstrap.go @@ -3,12 +3,15 @@ package ground import ( "context" "fmt" + "time" "github.com/pkg/errors" + core_v1 "k8s.io/api/core/v1" rbac "k8s.io/api/rbac/v1" storage "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" v1 "github.com/sapcc/kubernikus/pkg/apis/kubernikus/v1" @@ -85,6 +88,13 @@ func SeedKluster(clients config.Clients, factories config.Factories, images vers } } + if err := SeedKubernikusServiceAccount(kubernetes); err != nil { + return fmt.Errorf("Failed to seed kubernikus service account: %w", err) + } + if err := UpdateServiceAccountTokenInSecret(kluster, clients.Kubernetes, kubernetes); err != nil { + return fmt.Errorf("Failed to update sa token in cluster secret: %w", err) + } + if ok, _ := util.KlusterVersionConstraint(kluster, ">= 1.20"); ok { dynamicKubernetes, err := clients.Satellites.DynamicClientFor(kluster) if err != nil { @@ -410,3 +420,70 @@ func SeedOpenStackClusterRoleBindings(client clientset.Interface) error { return nil } + +func SeedKubernikusServiceAccount(client clientset.Interface) error { + err := bootstrap.CreateOrUpdateServiceAccount(client, &core_v1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kubernikus", + Namespace: "kube-system", + }, + }) + if err != nil { + 
return fmt.Errorf("Failed to ensure kubernikus serviceaccount: %w", err) + } + + return bootstrap.CreateOrUpdateClusterRoleBindingV1(client, &rbac.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kubernikus:monitor", + }, + RoleRef: rbac.RoleRef{ + APIGroup: rbac.GroupName, + Kind: "ClusterRole", + Name: "system:monitoring", + }, + Subjects: []rbac.Subject{ + { + Kind: "ServiceAccount", + Name: "kubernikus", + Namespace: "kube-system", + }, + }, + }) +} + +func UpdateServiceAccountTokenInSecret(kluster *v1.Kluster, cpClient clientset.Interface, klusterClient clientset.Interface) error { + + secretName := "" + err := wait.Poll(50*time.Millisecond, 2*time.Second, func() (bool, error) { + sa, err := klusterClient.CoreV1().ServiceAccounts("kube-system").Get(context.TODO(), "kubernikus", metav1.GetOptions{}) + if err != nil { + return false, err + } + if len(sa.Secrets) == 0 { + return false, nil + } + secretName = sa.Secrets[0].Name + return true, nil + }) + if err != nil { + return fmt.Errorf("Failed to get secret name from sa: %w", err) + } + klusterSecret, err := util.KlusterSecret(cpClient, kluster) + if err != nil { + return fmt.Errorf("Failed to get kluster secret: %w", err) + } + secret, err := klusterClient.CoreV1().Secrets("kube-system").Get(context.TODO(), secretName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("Failed to fetch sa secret: %w", err) + } + token, ok := secret.Data["token"] + if !ok { + return errors.New("Secret is missing token field") + } + if klusterSecret.ServiceAccount == string(token) { + return nil + } + klusterSecret.ServiceAccount = string(token) + return util.UpdateKlusterSecret(cpClient, kluster, klusterSecret) + +} diff --git a/pkg/controller/ground/bootstrap/util.go b/pkg/controller/ground/bootstrap/util.go index 5019537e46..1632afe62e 100644 --- a/pkg/controller/ground/bootstrap/util.go +++ b/pkg/controller/ground/bootstrap/util.go @@ -129,19 +129,6 @@ func CreateOrUpdateClusterRoleBindingV1(client 
clientset.Interface, clusterRoleB return nil } -func CreateOrUpdateRoleBinding(client clientset.Interface, roleBinding *rbac.RoleBinding) error { - if _, err := client.RbacV1beta1().RoleBindings(roleBinding.Namespace).Create(context.TODO(), roleBinding, metav1.CreateOptions{}); err != nil { - if !apierrors.IsAlreadyExists(err) { - return errors.Wrap(err, "unable to create RBAC rolebinding") - } - - if _, err := client.RbacV1beta1().RoleBindings(roleBinding.Namespace).Update(context.TODO(), roleBinding, metav1.UpdateOptions{}); err != nil { - return errors.Wrap(err, "unable to update RBAC rolebinding") - } - } - return nil -} - func CreateOrUpdateRoleBindingV1(client clientset.Interface, roleBinding *rbac_v1.RoleBinding) error { if _, err := client.RbacV1().RoleBindings(roleBinding.Namespace).Create(context.TODO(), roleBinding, metav1.CreateOptions{}); err != nil { if !apierrors.IsAlreadyExists(err) { @@ -168,19 +155,6 @@ func CreateOrUpdateRole(client clientset.Interface, role *rbac_v1.Role) error { return nil } -func CreateOrUpdateClusterRole(client clientset.Interface, clusterRole *rbac.ClusterRole) error { - if _, err := client.RbacV1beta1().ClusterRoles().Create(context.TODO(), clusterRole, metav1.CreateOptions{}); err != nil { - if !apierrors.IsAlreadyExists(err) { - return errors.Wrap(err, "unable to create RBAC clusterrole") - } - - if _, err := client.RbacV1beta1().ClusterRoles().Update(context.TODO(), clusterRole, metav1.UpdateOptions{}); err != nil { - return errors.Wrap(err, "unable to update RBAC clusterrole") - } - } - return nil -} - func CreateOrUpdateClusterRoleV1(client clientset.Interface, clusterRole *rbac_v1.ClusterRole) error { if _, err := client.RbacV1().ClusterRoles().Create(context.TODO(), clusterRole, metav1.CreateOptions{}); err != nil { if !apierrors.IsAlreadyExists(err) { diff --git a/pkg/migration/22_seed_kubernikus_sa.go b/pkg/migration/22_seed_kubernikus_sa.go new file mode 100644 index 0000000000..85a63225ec --- /dev/null +++ 
b/pkg/migration/22_seed_kubernikus_sa.go @@ -0,0 +1,23 @@ +package migration + +import ( + "fmt" + + v1 "github.com/sapcc/kubernikus/pkg/apis/kubernikus/v1" + "github.com/sapcc/kubernikus/pkg/controller/config" + "github.com/sapcc/kubernikus/pkg/controller/ground" +) + +func SeedKubernikusServiceAccount(rawKluster []byte, current *v1.Kluster, clients config.Clients, factories config.Factories) (err error) { + kubernetes, err := clients.Satellites.ClientFor(current) + if err != nil { + return err + } + if err := ground.SeedKubernikusServiceAccount(kubernetes); err != nil { + return fmt.Errorf("Failed to seed kubernikus service account: %w", err) + } + if err := ground.UpdateServiceAccountTokenInSecret(current, clients.Kubernetes, kubernetes); err != nil { + return fmt.Errorf("Failed to update sa token in cluster secret: %w", err) + } + return nil +} diff --git a/pkg/migration/register.go b/pkg/migration/register.go index 60e98fd15c..38d23510b1 100644 --- a/pkg/migration/register.go +++ b/pkg/migration/register.go @@ -28,6 +28,7 @@ func init() { FixFlannelOnFlatcar, KlusterSecretOpenStackIds, Helm2to3, + SeedKubernikusServiceAccount, // <-- Insert new migrations at the end only! } }