Updates node-exporter and cAdvisor images (#852)
* Updates node-exporter DaemonSet to v1.2.2

* Removes node-exporter from Prometheus deployment

The only reason node-exporter was deployed within the Prometheus pod
was to monitor the persistent volume that is mounted inside the pod.
However, this same data is available through cAdvisor, and we will use
that instead.

* Updates cAdvisor version and enables disk metrics

* Adds cAdvisor deployment to data-processing cluster

* Removes node-exporter from Prom deployment

The useful node-exporter metrics, those describing the persistent disk
inside the Prom pod, will now be gathered by cAdvisor.

* Updates node-exporter DaemonSet to v1.2.2

* Restores mistakenly deleted volumes section

* Removes node_filesystem metrics in favor of container_fs

* Revert "Adds cAdvisor deployment to data-processing cluster"

This reverts commit 3d76093.

* Uses kubelet instead of cAdvisor metrics

* Adds cluster label to all kubernetes-nodes metrics

* Updates Prom disk full alert with new metrics

* Puts alert template inside double quotes

* Updates Filesystem Available Estimate panel

Replaces most node-exporter metrics with kubelet volume metrics in the
Filesystem Available Estimate panel of the Prometheus:SelfMonitoring
dashboard.

* Fixes kubelet volume metric scraped from DP cluster
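
Taken together, the bullets above amount to one metric migration. A condensed
side-by-side sketch, paraphrased from this commit's alert-rule diff (label
selectors trimmed for brevity, not an exact quote):

```yaml
# Before: node-exporter had to run inside the Prometheus pod to see the
# /prometheus mount, and the alert divided its filesystem metrics:
expr: |
  (node_filesystem_avail_bytes{mountpoint="/prometheus"}
    / node_filesystem_size_bytes{mountpoint="/prometheus"}) < 0.05

# After: the kubelet already reports per-PVC usage, so the sidecar goes away:
expr: |
  (kubelet_volume_stats_available_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}
    / kubelet_volume_stats_capacity_bytes) < 0.05
```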
nkinkade authored Dec 7, 2021
1 parent b4a01fd commit 7694de3
Showing 8 changed files with 260 additions and 383 deletions.
503 changes: 242 additions & 261 deletions config/federation/grafana/dashboards/Prometheus_SelfMonitoring.json

Large diffs are not rendered by default.

18 changes: 6 additions & 12 deletions config/federation/prometheus/alerts.yml
@@ -1213,23 +1213,17 @@ groups:
# that includes its own version of this alert.
- alert: PrometheusPersistentDiskTooFull
expr: |
- ((node_filesystem_avail_bytes{cluster="data-processing", mountpoint="/prometheus"}
- / node_filesystem_size_bytes{cluster="data-processing", mountpoint="/prometheus"}) < 0.05) OR
- ((node_filesystem_avail_bytes{cluster="prometheus-federation", mountpoint="/prometheus"}
- / node_filesystem_size_bytes{cluster="prometheus-federation", mountpoint="/prometheus"}) < 0.05)
+ ((kubelet_volume_stats_available_bytes{cluster="data-processing", persistentvolumeclaim="auto-prometheus-ssd0"}
+ / kubelet_volume_stats_capacity_bytes) < 0.05) OR
+ ((kubelet_volume_stats_available_bytes{cluster="prometheus-federation", persistentvolumeclaim="auto-prometheus-disk0"}
+ / kubelet_volume_stats_capacity_bytes) < 0.05)
for: 1m
labels:
repo: ops-tracker
severity: ticket
- cluster: prometheus-federation
+ cluster: "{{ $labels.cluster }}"
annotations:
summary: The Prometheus persistent disk has less than 5% free space.
- description: >
-   The Prometheus persistent disk has less than 5% free space.
-   Investigate filesystem usage on the VM, but most likely if this alert
-   fires it means that the size of the persistent disk is too small and
-   may need to be increased. GCE persistent disks can be resized, even on
-   a running VM. Please refer to the [instructions on how to do this][1].
-   [1]: https://github.com/m-lab/k8s-support/blob/master/manage-cluster/PROMETHEUS.md#resizing-the-prometheus-vms-disk
+ description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#prometheuspersistentdisktoofull
dashboard: https://grafana.mlab-oti.measurementlab.net/d/sVklmeHik/prometheus-self-monitoring?orgId=1&var-datasource=default
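
Two commit-message bullets come together in the `cluster` label change above:
alerting-rule labels are template-expanded per alert, and the template must be
quoted to survive YAML parsing. A minimal sketch of why the quotes matter:

```yaml
labels:
  # Unquoted, the "{{" would start a YAML flow mapping and the rules file
  # would fail to load:
  #   cluster: {{ $labels.cluster }}   # invalid YAML
  # Quoted, the value parses as a plain string, and Prometheus expands it
  # at evaluation time, so each firing alert carries the cluster it
  # actually came from instead of a hardcoded value:
  cluster: "{{ $labels.cluster }}"
```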

10 changes: 7 additions & 3 deletions config/federation/prometheus/prometheus.yml.template
@@ -138,7 +138,11 @@ scrape_configs:
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics

+ # Add explicit cluster label to node metrics.
+ - source_labels: []
+   regex: .*
+   target_label: cluster
+   replacement: prometheus-federation
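
For reference, the shape of the relabel rule added here: with an empty
`source_labels` list the concatenated source value is the empty string, which
`.*` always matches, so the replacement is applied to every target
unconditionally (the `source_labels`/`regex` pair could arguably be omitted,
since match-everything is the default):

```yaml
relabel_configs:
  # Stamp a fixed cluster label on everything scraped by this job.
  - source_labels: []
    regex: .*
    target_label: cluster
    replacement: prometheus-federation
```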

# kube-state-metrics reports status about k8s objects (pods, nodes,
# deployments, etc). We cannot rely on service-discovery because the metric
@@ -389,8 +393,8 @@ scrape_configs:
'match[]':
- 'up{container="etl-gardener",instance=~".*:9090"}'
- 'up{container="etl-parser",instance=~".*:9090"}'
- - 'node_filesystem_size_bytes{deployment="node-exporter"}'
- - 'node_filesystem_avail_bytes{deployment="node-exporter"}'
+ - 'kubelet_volume_stats_available_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}'
+ - 'kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}'
static_configs:
- targets: ['prometheus-data-processing.{{PROJECT}}.measurementlab.net:9090']

2 changes: 1 addition & 1 deletion k8s/data-processing/deployments/node-exporter.yml
@@ -15,7 +15,7 @@ spec:
prometheus.io/scrape: 'true'
spec:
containers:
- - image: prom/node-exporter:v1.1.1
+ - image: prom/node-exporter:v1.2.2
name: node-exporter
ports:
- containerPort: 9100
52 changes: 1 addition & 51 deletions k8s/data-processing/deployments/prometheus.yml
@@ -105,57 +105,6 @@ spec:
- mountPath: /prometheus-config
name: prometheus-config

- # Run a node-exporter as part of the prometheus-server pod so that it has
- # access to the same namespace and volumes as the prometheus-server. This
- # allows simple disk usage monitoring of the "/prometheus" mount point.
- - image: prom/node-exporter:v0.18.1
- name: node-exporter
- # Note: only enable the filesystem collector, and ignore system paths.
- args: ["--no-collector.arp",
- "--no-collector.bcache",
- "--no-collector.bonding",
- "--no-collector.conntrack",
- "--no-collector.cpu",
- "--no-collector.cpufreq",
- "--no-collector.diskstats",
- "--no-collector.edac",
- "--no-collector.entropy",
- "--no-collector.filefd",
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)",
- "--no-collector.hwmon",
- "--no-collector.infiniband",
- "--no-collector.ipvs",
- "--no-collector.loadavg",
- "--no-collector.mdadm",
- "--no-collector.meminfo",
- "--no-collector.netclass",
- "--no-collector.netdev",
- "--no-collector.netstat",
- "--no-collector.nfs",
- "--no-collector.nfsd",
- "--no-collector.pressure",
- "--no-collector.sockstat",
- "--no-collector.stat",
- "--no-collector.textfile",
- "--no-collector.time",
- "--no-collector.timex",
- "--no-collector.uname",
- "--no-collector.vmstat",
- "--no-collector.xfs",
- "--no-collector.zfs"]
- ports:
- - containerPort: 9100
- resources:
- requests:
- memory: "10Mi"
- cpu: "50m"
- limits:
- memory: "10Mi"
- cpu: "50m"
- volumeMounts:
- - mountPath: /prometheus
- name: prometheus-storage

# Disks created manually, can be named here explicitly using
# gcePersistentDisk instead of the persistentVolumeClaim.
volumes:
@@ -165,3 +114,4 @@ spec:
- name: prometheus-config
configMap:
name: prometheus-cluster-config

4 changes: 2 additions & 2 deletions k8s/prometheus-federation/deployments/cadvisor.yml
@@ -19,15 +19,15 @@ spec:
spec:
containers:
- name: cadvisor
- image: k8s.gcr.io/cadvisor:v0.34.0
+ image: gcr.io/cadvisor/cadvisor:v0.38.8
args:
- --housekeeping_interval=60s
- --max_housekeeping_interval=75s
- --event_storage_event_limit=default=0
- --event_storage_age_limit=default=0
# Note: tcp,udp stats are very expensive.
# Enable only network, diskIO, cpu, memory.
- - --disable_metrics=percpu,disk,tcp,udp
+ - --disable_metrics=percpu,tcp,udp
# Only show stats for docker containers.
- --docker_only
resources:
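Dropping `disk` from `--disable_metrics` turns cAdvisor's per-container
filesystem collection back on. The commit ultimately alerts on kubelet volume
metrics instead, but the re-enabled series look like this (standard cAdvisor
metric names; exact labels and availability depend on the container runtime
and cAdvisor version, and `prometheus-server` is an illustrative container
name, not taken from this diff):

```yaml
# Example PromQL over the re-enabled cAdvisor disk metrics:
expr: |
  container_fs_usage_bytes{container="prometheus-server"}
    / container_fs_limit_bytes{container="prometheus-server"}
```
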
2 changes: 1 addition & 1 deletion k8s/prometheus-federation/deployments/node-exporter.yml
@@ -20,7 +20,7 @@ spec:
spec:
containers:
- name: node-exporter
- image: prom/node-exporter:v1.1.2
+ image: prom/node-exporter:v1.2.2
args:
- --collector.processes
ports:
52 changes: 0 additions & 52 deletions k8s/prometheus-federation/deployments/prometheus.yml
@@ -100,58 +100,6 @@ spec:
# /etc/prometheus/prometheus.yml contains the M-Lab Prometheus config.
- mountPath: /etc/prometheus
name: prometheus-config

- # Run a node-exporter as part of the prometheus-server pod so that it has
- # access to the same namespace and volumes as the prometheus-server. This
- # allows simple disk usage monitoring of the "/prometheus" mount point.
- - image: prom/node-exporter:v0.18.1
- name: node-exporter
- # Note: only enable the filesystem collector, and ignore system paths.
- args: ["--no-collector.arp",
- "--no-collector.bcache",
- "--no-collector.bonding",
- "--no-collector.conntrack",
- "--no-collector.cpu",
- "--no-collector.cpufreq",
- "--no-collector.diskstats",
- "--no-collector.edac",
- "--no-collector.entropy",
- "--no-collector.filefd",
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)",
- "--no-collector.hwmon",
- "--no-collector.infiniband",
- "--no-collector.ipvs",
- "--no-collector.loadavg",
- "--no-collector.mdadm",
- "--no-collector.meminfo",
- "--no-collector.netclass",
- "--no-collector.netdev",
- "--no-collector.netstat",
- "--no-collector.nfs",
- "--no-collector.nfsd",
- "--no-collector.pressure",
- "--no-collector.sockstat",
- "--no-collector.stat",
- "--no-collector.textfile",
- "--no-collector.time",
- "--no-collector.timex",
- "--no-collector.uname",
- "--no-collector.vmstat",
- "--no-collector.xfs",
- "--no-collector.zfs"]
- ports:
- - containerPort: 9100
- resources:
- requests:
- memory: "10Mi"
- cpu: "50m"
- limits:
- memory: "10Mi"
- cpu: "50m"
- volumeMounts:
- - mountPath: /prometheus
- name: prometheus-storage

- image: measurementlab/gcp-service-discovery:v1.5.1
name: service-discovery
args: [ "--aef-target=/targets/aeflex-targets/aeflex.json",
