Skip to content

Commit

Permalink
Scrape kluster metrics
Browse files Browse the repository at this point in the history
This PR adds two features:
* It allows us to centrally scrape the kluster metrics (apiserver, ccm, cm, scheduler) into a dedicated prometheus with short retention
* It exposes scheduler, cm, ccm metrics via an ingress so that users can start collecting cluster metrics themselves.

The metrics of all components (except etcd) are protected by kubernetes RBAC, so a service account token with RBAC permissions for `/metrics` is required.
  • Loading branch information
databus23 committed Jul 15, 2022
1 parent 670e696 commit 5e09376
Show file tree
Hide file tree
Showing 12 changed files with 232 additions and 33 deletions.
8 changes: 6 additions & 2 deletions charts/kube-master/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,14 @@ We truncate at 63 chars because some Kubernetes name fields are limited to this
{{- required "tag for fluentd missing" .Values.images.fluentd.tag }}
{{- end -}}

{{/*
Base ingress hostname of the kluster: .Values.api.apiserverHost with the
master fullname replaced by "<fullname>.ingress". Callers prepend a
component prefix (for example "dashboard-" or "auth-") to build a host.
*/}}
{{- define "ingress.base" -}}
{{- .Values.api.apiserverHost | replace (include "master.fullname" .) (printf "%s.ingress" (include "master.fullname" .) ) -}}
{{- end -}}

{{- define "dashboard.url" -}}
{{- printf "dashboard-%s" ( .Values.api.apiserverHost | replace (include "master.fullname" .) (printf "%s.ingress" (include "master.fullname" .) ) ) -}}
dashboard-{{ include "ingress.base" . -}}
{{- end -}}

{{- define "dex.url" -}}
{{- printf "auth-%s" ( .Values.api.apiserverHost | replace (include "master.fullname" .) (printf "%s.ingress" (include "master.fullname" .) ) ) -}}
auth-{{ include "ingress.base" . -}}
{{- end -}}
4 changes: 2 additions & 2 deletions charts/kube-master/templates/api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ spec:
containers:
- name: apiserver
ports:
- containerPort: 443
name: server
- containerPort: {{ required "missing advertisePort" .Values.advertisePort }}
name: api
protocol: TCP
{{- if (semverCompare ">= 1.19" .Values.version.kubernetes) }}
image: {{ include "apiserver.image" . | quote }}
Expand Down
4 changes: 4 additions & 0 deletions charts/kube-master/templates/cloud-controller-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ spec:
template:
metadata:
labels:
component: cloud-controller-manager
app: cloud-controller-manager
kluster: {{ .Values.name }}
account: {{ .Values.account }}
Expand Down Expand Up @@ -99,6 +100,9 @@ spec:
{{- end }}
- --use-service-account-credentials=true
- --concurrent-service-syncs=10
ports:
- name: metrics
containerPort: 10258
livenessProbe:
httpGet:
path: /healthz
Expand Down
50 changes: 50 additions & 0 deletions charts/kube-master/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,54 @@ spec:
serviceName: {{ include "master.fullname" . }}
servicePort: 6553
{{- end }}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: {{ include "master.fullname" . }}-metrics
labels:
chart: "{{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}"
release: {{ .Release.Name }}
annotations:
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
ingress.kubernetes.io/backend-protocol: "HTTPS"

spec:
rules:
- host: cm-{{include "ingress.base" . }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: {{ include "master.fullname" . }}-cm
port:
number: 10257
- host: ccm-{{include "ingress.base" . }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: {{ include "master.fullname" . }}-ccm
port:
number: 10258
- host: scheduler-{{include "ingress.base" . }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: {{ include "master.fullname" . }}-sched
port:
number: 10259
tls:
- hosts:
- cm-{{include "ingress.base" . }}
- ccm-{{include "ingress.base" . }}
- scheduler-{{include "ingress.base" . }}
secretName: {{ required "dex.ingressSecret undefined" .Values.dex.ingressSecret }}
{{- end }}
42 changes: 42 additions & 0 deletions charts/kube-master/templates/podmonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# PodMonitor that scrapes the control plane pods of this kluster.
# Both endpoints authenticate with the bearer token stored under the
# "serviceAccount" key of the kluster secret.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  labels:
    # Must match the key defined in values.yaml (metrics.prometheusName);
    # an empty label value would leave this monitor unselected.
    prometheus: {{ required "missing metrics.prometheusName" .Values.metrics.prometheusName }}
  name: {{ include "master.fullname" . }}
spec:
  jobLabel: {{ include "master.fullname" . }}
  namespaceSelector:
    matchNames:
    - {{ .Release.Namespace }}
  podMetricsEndpoints:
  # Container port named "api".
  - interval: 60s
    port: api
    relabelings:
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    scheme: https
    bearerTokenSecret:
      name: {{ include "master.fullname" . }}-secret
      key: serviceAccount
    tlsConfig:
      insecureSkipVerify: true
    scrapeTimeout: 10s
  # Container ports named "metrics"; pods labelled component=etcd are dropped.
  - interval: 60s
    port: metrics
    relabelings:
    - action: drop
      regex: ^etcd$
      sourceLabels: [__meta_kubernetes_pod_label_component]
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    scheme: https
    bearerTokenSecret:
      name: {{ include "master.fullname" . }}-secret
      key: serviceAccount
    tlsConfig:
      insecureSkipVerify: true
    scrapeTimeout: 10s
  selector:
    matchLabels:
      release: {{ include "master.fullname" . }}
24 changes: 22 additions & 2 deletions charts/kube-master/templates/service-metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ metadata:
labels:
component: controller-manager-metrics
release: {{ .Release.Name }}
name: {{ .Release.Name }}-cm-met
name: {{ .Release.Name }}-cm
spec:
clusterIP: None
ports:
- name: metrics
port: 10257
Expand All @@ -18,12 +19,31 @@ spec:
---
apiVersion: v1
kind: Service
metadata:
labels:
component: cloud-controller-manager-metrics
release: {{ .Release.Name }}
name: {{ .Release.Name }}-ccm
spec:
clusterIP: None
ports:
- name: metrics
port: 10258
protocol: TCP
targetPort: 10258
selector:
component: cloud-controller-manager
release: {{ .Release.Name }}
---
apiVersion: v1
kind: Service
metadata:
labels:
component: scheduler-metrics
release: {{ .Release.Name }}
name: {{ .Release.Name }}-sched-met
name: {{ .Release.Name }}-sched
spec:
clusterIP: None
ports:
- name: metrics
port: 10259
Expand Down
3 changes: 3 additions & 0 deletions charts/kube-master/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,6 @@ csi:
memory: 100Mi

audit: ""

metrics:
prometheusName: kubernikus-collector
3 changes: 2 additions & 1 deletion pkg/apis/kubernikus/v1/secret.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ type Secret struct {

Certificates

ExtraValues string `json:"extra-values,omitempty"`
ExtraValues string `json:"extra-values,omitempty"`
ServiceAccount string `json:"serviceAccount,omitempty"`
}

func NewSecret(secret *corev1.Secret) (*Secret, error) {
Expand Down
77 changes: 77 additions & 0 deletions pkg/controller/ground/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ package ground
import (
"context"
"fmt"
"time"

"github.com/pkg/errors"
core_v1 "k8s.io/api/core/v1"
rbac "k8s.io/api/rbac/v1"
storage "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"

v1 "github.com/sapcc/kubernikus/pkg/apis/kubernikus/v1"
Expand Down Expand Up @@ -85,6 +88,13 @@ func SeedKluster(clients config.Clients, factories config.Factories, images vers
}
}

if err := SeedKubernikusServiceAccount(kubernetes); err != nil {
return fmt.Errorf("Failed to seed kubernikus service account: %w", err)
}
if err := UpdateServiceAccountTokenInSecret(kluster, clients.Kubernetes, kubernetes); err != nil {
return fmt.Errorf("Failed to update sa token in cluster secret: %w", err)
}

if ok, _ := util.KlusterVersionConstraint(kluster, ">= 1.20"); ok {
dynamicKubernetes, err := clients.Satellites.DynamicClientFor(kluster)
if err != nil {
Expand Down Expand Up @@ -410,3 +420,70 @@ func SeedOpenStackClusterRoleBindings(client clientset.Interface) error {

return nil
}

// SeedKubernikusServiceAccount ensures the "kubernikus" service account exists
// in the kube-system namespace and that the "kubernikus:monitor"
// ClusterRoleBinding ties it to the "system:monitoring" ClusterRole.
//
// Both objects are written through the bootstrap create-or-update helpers.
// A service account failure is wrapped; a binding failure is returned as-is.
func SeedKubernikusServiceAccount(client clientset.Interface) error {
	const (
		accountName      = "kubernikus"
		accountNamespace = "kube-system"
	)

	account := &core_v1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      accountName,
			Namespace: accountNamespace,
		},
	}
	if err := bootstrap.CreateOrUpdateServiceAccount(client, account); err != nil {
		return fmt.Errorf("Failed to ensure kubernikus serviceaccount: %w", err)
	}

	binding := &rbac.ClusterRoleBinding{
		ObjectMeta: metav1.ObjectMeta{Name: "kubernikus:monitor"},
		RoleRef: rbac.RoleRef{
			APIGroup: rbac.GroupName,
			Kind:     "ClusterRole",
			Name:     "system:monitoring",
		},
		Subjects: []rbac.Subject{{
			Kind:      "ServiceAccount",
			Name:      accountName,
			Namespace: accountNamespace,
		}},
	}
	return bootstrap.CreateOrUpdateClusterRoleBindingV1(client, binding)
}

// UpdateServiceAccountTokenInSecret copies the token of the "kubernikus"
// service account (namespace kube-system) into the ServiceAccount field of the
// kluster secret.
//
// klusterClient is used to read the service account and the secret it
// references; cpClient is used to read and update the kluster secret. The
// kluster secret is only written when the stored token differs from the
// current one.
//
// It returns an error when the service account cannot be read, when it lists
// no secret within the polling window, when either secret cannot be fetched,
// or when the referenced secret carries no (or an empty) "token" entry.
//
// NOTE(review): this relies on the service account listing a token secret in
// its Secrets field. Kubernetes versions that no longer auto-generate such
// secrets would presumably run into the poll timeout here. Confirm against
// the supported kluster versions.
func UpdateServiceAccountTokenInSecret(kluster *v1.Kluster, cpClient clientset.Interface, klusterClient clientset.Interface) error {
	var secretName string

	// Poll for the token secret reference on the service account. A failing
	// Get aborts the poll immediately instead of being retried.
	err := wait.Poll(50*time.Millisecond, 2*time.Second, func() (bool, error) {
		sa, err := klusterClient.CoreV1().ServiceAccounts("kube-system").Get(context.TODO(), "kubernikus", metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		if len(sa.Secrets) == 0 {
			return false, nil
		}
		secretName = sa.Secrets[0].Name
		return true, nil
	})
	if err != nil {
		return fmt.Errorf("Failed to get secret name from sa: %w", err)
	}

	klusterSecret, err := util.KlusterSecret(cpClient, kluster)
	if err != nil {
		return fmt.Errorf("Failed to get kluster secret: %w", err)
	}

	secret, err := klusterClient.CoreV1().Secrets("kube-system").Get(context.TODO(), secretName, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("Failed to fetch sa secret: %w", err)
	}

	// An empty token is as unusable as a missing one. Rejecting it keeps an
	// empty value from being stored, or silently accepted as "unchanged".
	token, ok := secret.Data["token"]
	if !ok || len(token) == 0 {
		return errors.New("Secret is missing token field")
	}

	// Skip the write when the stored token is already current.
	if klusterSecret.ServiceAccount == string(token) {
		return nil
	}
	klusterSecret.ServiceAccount = string(token)
	return util.UpdateKlusterSecret(cpClient, kluster, klusterSecret)
}
26 changes: 0 additions & 26 deletions pkg/controller/ground/bootstrap/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,19 +129,6 @@ func CreateOrUpdateClusterRoleBindingV1(client clientset.Interface, clusterRoleB
return nil
}

func CreateOrUpdateRoleBinding(client clientset.Interface, roleBinding *rbac.RoleBinding) error {
if _, err := client.RbacV1beta1().RoleBindings(roleBinding.Namespace).Create(context.TODO(), roleBinding, metav1.CreateOptions{}); err != nil {
if !apierrors.IsAlreadyExists(err) {
return errors.Wrap(err, "unable to create RBAC rolebinding")
}

if _, err := client.RbacV1beta1().RoleBindings(roleBinding.Namespace).Update(context.TODO(), roleBinding, metav1.UpdateOptions{}); err != nil {
return errors.Wrap(err, "unable to update RBAC rolebinding")
}
}
return nil
}

func CreateOrUpdateRoleBindingV1(client clientset.Interface, roleBinding *rbac_v1.RoleBinding) error {
if _, err := client.RbacV1().RoleBindings(roleBinding.Namespace).Create(context.TODO(), roleBinding, metav1.CreateOptions{}); err != nil {
if !apierrors.IsAlreadyExists(err) {
Expand All @@ -168,19 +155,6 @@ func CreateOrUpdateRole(client clientset.Interface, role *rbac_v1.Role) error {
return nil
}

func CreateOrUpdateClusterRole(client clientset.Interface, clusterRole *rbac.ClusterRole) error {
if _, err := client.RbacV1beta1().ClusterRoles().Create(context.TODO(), clusterRole, metav1.CreateOptions{}); err != nil {
if !apierrors.IsAlreadyExists(err) {
return errors.Wrap(err, "unable to create RBAC clusterrole")
}

if _, err := client.RbacV1beta1().ClusterRoles().Update(context.TODO(), clusterRole, metav1.UpdateOptions{}); err != nil {
return errors.Wrap(err, "unable to update RBAC clusterrole")
}
}
return nil
}

func CreateOrUpdateClusterRoleV1(client clientset.Interface, clusterRole *rbac_v1.ClusterRole) error {
if _, err := client.RbacV1().ClusterRoles().Create(context.TODO(), clusterRole, metav1.CreateOptions{}); err != nil {
if !apierrors.IsAlreadyExists(err) {
Expand Down
23 changes: 23 additions & 0 deletions pkg/migration/22_seed_kubernikus_sa.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package migration

import (
"fmt"

v1 "github.com/sapcc/kubernikus/pkg/apis/kubernikus/v1"
"github.com/sapcc/kubernikus/pkg/controller/config"
"github.com/sapcc/kubernikus/pkg/controller/ground"
)

// SeedKubernikusServiceAccount is the migration that seeds the kubernikus
// service account in the given kluster and then stores that account's token
// in the kluster secret.
//
// It obtains a client for the kluster from clients.Satellites and delegates
// both steps to the ground package. rawKluster and factories are not used.
func SeedKubernikusServiceAccount(rawKluster []byte, current *v1.Kluster, clients config.Clients, factories config.Factories) (err error) {
	klusterClient, clientErr := clients.Satellites.ClientFor(current)
	if clientErr != nil {
		return clientErr
	}

	if seedErr := ground.SeedKubernikusServiceAccount(klusterClient); seedErr != nil {
		return fmt.Errorf("Failed to seed kubernikus service account: %w", seedErr)
	}

	tokenErr := ground.UpdateServiceAccountTokenInSecret(current, clients.Kubernetes, klusterClient)
	if tokenErr != nil {
		return fmt.Errorf("Failed to update sa token in cluster secret: %w", tokenErr)
	}

	return nil
}
1 change: 1 addition & 0 deletions pkg/migration/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func init() {
FixFlannelOnFlatcar,
KlusterSecretOpenStackIds,
Helm2to3,
SeedKubernikusServiceAccount,
// <-- Insert new migrations at the end only!
}
}

0 comments on commit 5e09376

Please sign in to comment.