From 79ecad63075568c059931d132e5d8efc756cdac3 Mon Sep 17 00:00:00 2001 From: Michael Montgomery Date: Mon, 12 Aug 2024 09:47:24 -0500 Subject: [PATCH] Introduce metrics for lvmd process. Signed-off-by: Michael Montgomery --- charts/topolvm/README.md | 9 +++ charts/topolvm/templates/lvmd/daemonset.yaml | 9 +++ charts/topolvm/templates/lvmd/podmonitor.yaml | 37 ++++++++++++ charts/topolvm/values.yaml | 40 +++++++++++++ cmd/lvmd/app/root.go | 57 ++++++++++++++++--- test/e2e/metrics_test.go | 27 +++++++++ 6 files changed, 170 insertions(+), 9 deletions(-) create mode 100644 charts/topolvm/templates/lvmd/podmonitor.yaml diff --git a/charts/topolvm/README.md b/charts/topolvm/README.md index e3ebecaab..7e89d9ca8 100644 --- a/charts/topolvm/README.md +++ b/charts/topolvm/README.md @@ -72,10 +72,19 @@ See [Getting Started](https://github.com/topolvm/topolvm/blob/topolvm-chart-v15. | lvmd.labels | object | `{}` | Additional labels to be added to the Daemonset. | | lvmd.lvcreateOptionClasses | list | `[]` | Specify the lvcreate-option-class settings. | | lvmd.managed | bool | `true` | If true, set up lvmd service with DaemonSet. | +| lvmd.metrics.annotations | object | `{"prometheus.io/port":"metrics"}` | Annotations for Scrape used by Prometheus. | +| lvmd.metrics.enabled | bool | `true` | If true, enable scraping of metrics by Prometheus. | | lvmd.nodeSelector | object | `{}` | Specify nodeSelector. # ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ | | lvmd.podLabels | object | `{}` | Additional labels to be set on the lvmd service pods. | | lvmd.priorityClassName | string | `nil` | Specify priorityClassName. | | lvmd.profiling.bindAddress | string | `""` | Enables pprof profiling server. If empty, profiling is disabled. | +| lvmd.prometheus.podMonitor.additionalLabels | object | `{}` | Additional labels that can be used so PodMonitor will be discovered by Prometheus. 
| +| lvmd.prometheus.podMonitor.enabled | bool | `false` | Set this to `true` to create PodMonitor for Prometheus operator. | +| lvmd.prometheus.podMonitor.interval | string | `""` | Scrape interval. If not set, the Prometheus default scrape interval is used. | +| lvmd.prometheus.podMonitor.metricRelabelings | list | `[]` | MetricRelabelConfigs to apply to samples before ingestion. | +| lvmd.prometheus.podMonitor.namespace | string | `""` | Optional namespace in which to create PodMonitor. | +| lvmd.prometheus.podMonitor.relabelings | list | `[]` | RelabelConfigs to apply to samples before scraping. | +| lvmd.prometheus.podMonitor.scrapeTimeout | string | `""` | Scrape timeout. If not set, the Prometheus default scrape timeout is used. | | lvmd.socketName | string | `"/run/topolvm/lvmd.sock"` | Specify socketName. | | lvmd.tolerations | list | `[]` | Specify tolerations. # ref: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ | | lvmd.updateStrategy | object | `{}` | Specify updateStrategy. | diff --git a/charts/topolvm/templates/lvmd/daemonset.yaml b/charts/topolvm/templates/lvmd/daemonset.yaml index f4104f1b7..0eedc6c5b 100644 --- a/charts/topolvm/templates/lvmd/daemonset.yaml +++ b/charts/topolvm/templates/lvmd/daemonset.yaml @@ -36,6 +36,11 @@ spec: {{- end }} annotations: checksum/config: {{ include (print $.Template.BasePath "/lvmd/configmap.yaml") . | sha256sum }} + {{- if .Values.lvmd.metrics.enabled }} + {{- with .Values.lvmd.metrics.annotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} spec: {{- with .Values.lvmd.priorityClassName }} priorityClassName: {{ . }} @@ -86,6 +91,10 @@ spec: {{- with .Values.livenessProbe.lvmd.periodSeconds }} periodSeconds: {{ . }} {{- end }} + ports: + - name: metrics + containerPort: 8080 + protocol: TCP {{- with .Values.resources.lvmd }} resources: {{ toYaml . 
| nindent 12 }} {{- end }} diff --git a/charts/topolvm/templates/lvmd/podmonitor.yaml b/charts/topolvm/templates/lvmd/podmonitor.yaml new file mode 100644 index 000000000..a6dedf132 --- /dev/null +++ b/charts/topolvm/templates/lvmd/podmonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.lvmd.prometheus.podMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ template "topolvm.fullname" . }}-lvmd + namespace: {{ .Values.lvmd.prometheus.podMonitor.namespace | default .Release.Namespace }} + labels: + {{- include "topolvm.labels" . | nindent 4 }} + {{- with .Values.lvmd.prometheus.podMonitor.additionalLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: lvmd + {{ include "topolvm.selectorLabels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + podMetricsEndpoints: + - path: /metrics + port: metrics + {{- with .Values.lvmd.prometheus.podMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.lvmd.prometheus.podMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} + {{- with .Values.lvmd.prometheus.podMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.lvmd.prometheus.podMonitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 6 }} + {{- end }} +{{- end }} diff --git a/charts/topolvm/values.yaml b/charts/topolvm/values.yaml index e92a98867..e0a4af008 100644 --- a/charts/topolvm/values.yaml +++ b/charts/topolvm/values.yaml @@ -221,6 +221,46 @@ lvmd: # lvmd.profiling.bindAddress -- Enables pprof profiling server. If empty, profiling is disabled. bindAddress: "" + metrics: + # lvmd.metrics.enabled -- If true, enable scraping of metrics by Prometheus. + enabled: true + # lvmd.metrics.annotations -- Annotations for Scrape used by Prometheus. 
+ annotations: + prometheus.io/port: metrics + + prometheus: + podMonitor: + # lvmd.prometheus.podMonitor.enabled -- Set this to `true` to create PodMonitor for Prometheus operator. + enabled: false + + # lvmd.prometheus.podMonitor.additionalLabels -- Additional labels that can be used so PodMonitor will be discovered by Prometheus. + additionalLabels: {} + + # lvmd.prometheus.podMonitor.namespace -- Optional namespace in which to create PodMonitor. + namespace: "" + + # lvmd.prometheus.podMonitor.interval -- Scrape interval. If not set, the Prometheus default scrape interval is used. + interval: "" + + # lvmd.prometheus.podMonitor.scrapeTimeout -- Scrape timeout. If not set, the Prometheus default scrape timeout is used. + scrapeTimeout: "" + + # lvmd.prometheus.podMonitor.relabelings -- RelabelConfigs to apply to samples before scraping. + relabelings: [] + # - sourceLabels: [__meta_kubernetes_service_label_cluster] + # targetLabel: cluster + # regex: (.*) + # replacement: ${1} + # action: replace + + # lvmd.prometheus.podMonitor.metricRelabelings -- MetricRelabelConfigs to apply to samples before ingestion. + metricRelabelings: [] + # - sourceLabels: [__meta_kubernetes_service_label_cluster] + # targetLabel: cluster + # regex: (.*) + # replacement: ${1} + # action: replace + # CSI node service node: # node.lvmdEmbedded -- Specify whether to embed lvmd in the node container. 
diff --git a/cmd/lvmd/app/root.go b/cmd/lvmd/app/root.go index 4af41b70e..8de9e96d2 100644 --- a/cmd/lvmd/app/root.go +++ b/cmd/lvmd/app/root.go @@ -10,9 +10,12 @@ import ( "os" "os/signal" "path/filepath" + "sync" "syscall" "time" + "github.com/go-logr/logr" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/cobra" "github.com/topolvm/topolvm" "github.com/topolvm/topolvm/internal/lvmd" @@ -33,6 +36,7 @@ var ( lvmPath string zapOpts zap.Options profilingBindAddress string + metricsBindAddress string ) // rootCmd represents the base command when called without any subcommands @@ -112,15 +116,7 @@ func subMain(parentCtx context.Context) error { ctx, stop := signal.NotifyContext(parentCtx, os.Interrupt, syscall.SIGTERM) defer stop() - var pprofServer *http.Server - if profilingBindAddress != "" { - pprofServer = profiling.NewProfilingServer(profilingBindAddress) - go func() { - if err := pprofServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { - logger.Error(err, "pprof server error") - } - }() - } + wg, pprofServer, metricsServer := startMetricsAndProfilingServers(logger) go func() { ticker := time.NewTicker(10 * time.Minute) @@ -133,7 +129,13 @@ func subMain(parentCtx context.Context) error { logger.Error(err, "failed to shutdown pprof server") } } + if metricsServer != nil { + if err := metricsServer.Shutdown(parentCtx); err != nil { + logger.Error(err, "failed to shutdown metrics server") + } + } grpcServer.GracefulStop() + wg.Wait() return case <-ticker.C: notifier() @@ -144,6 +146,42 @@ func subMain(parentCtx context.Context) error { return grpcServer.Serve(lis) } +// startMetricsAndProfilingServers starts metrics and profiling servers if the bind addresses are set +// and returns a wait group to wait for the servers to stop. 
+func startMetricsAndProfilingServers(logger logr.Logger) (*sync.WaitGroup, *http.Server, *http.Server) { + var wg sync.WaitGroup + var pprofServer *http.Server + if profilingBindAddress != "" { + wg.Add(1) + pprofServer = profiling.NewProfilingServer(profilingBindAddress) + go func() { + defer wg.Done() + if err := pprofServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { + logger.Error(err, "pprof server error") + } + }() + } + + var metricsServer *http.Server + if metricsBindAddress != "" { + wg.Add(1) + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + metricsServer = &http.Server{ + Addr: metricsBindAddress, + Handler: mux, + } + go func() { + defer wg.Done() + if err := metricsServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { + logger.Error(err, "metrics server error") + } + }() + } + + return &wg, pprofServer, metricsServer +} + // Execute adds all child commands to the root command and sets flags appropriately. // This is called by main.main(). It only needs to happen once to the rootCmd. func Execute() { @@ -158,6 +196,7 @@ func init() { rootCmd.PersistentFlags().StringVar(&cfgFilePath, "config", filepath.Join("/etc", "topolvm", "lvmd.yaml"), "config file") rootCmd.PersistentFlags().StringVar(&lvmPath, "lvm-path", "", "lvm command path on the host OS") rootCmd.PersistentFlags().StringVar(&profilingBindAddress, "profiling-bind-address", "", "bind address to expose pprof profiling. If empty, profiling is disabled") + rootCmd.PersistentFlags().StringVar(&metricsBindAddress, "metrics-bind-address", ":8080", "bind address to expose prometheus metrics. 
If empty, metrics are disabled") goflags := flag.NewFlagSet("klog", flag.ExitOnError) klog.InitFlags(goflags) diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 7e411f868..f253f85f1 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -17,6 +17,7 @@ import ( lvmdApp "github.com/topolvm/topolvm/cmd/lvmd/app" "github.com/topolvm/topolvm/internal/lvmd" lvmdTypes "github.com/topolvm/topolvm/pkg/lvmd/types" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/yaml" ) @@ -143,6 +144,32 @@ func testMetrics() { } }) }) + + Describe("topolvm-lvmd", func() { + It("should open ports for metrics", func() { + managed, err := isLVMManaged() + Expect(err).ShouldNot(HaveOccurred()) + if managed { + Eventually(func() error { + _, err := kubectl("exec", "-n", "topolvm-system", "daemonset/topolvm-lvmd-0", "-c=lvmd", "--", + "curl", "http://localhost:8080/metrics") + return err + }).Should(Succeed()) + } + }) + }) +} + +func isLVMManaged() (bool, error) { + var ds appsv1.DaemonSet + err := getObjects(&ds, "-n", "topolvm-system", "daemonset/topolvm-lvmd-0") + if err == ErrObjectNotFound { + return false, nil + } + if err != nil { + return false, fmt.Errorf("failed to get DaemonSet: %w", err) + } + return true, nil } func getMetricsFamily(nodeIP string) (map[string]*dto.MetricFamily, error) {