From 7694de30db867b6ebe85acc5c52718761fa1f7e6 Mon Sep 17 00:00:00 2001 From: nkinkade Date: Tue, 7 Dec 2021 11:36:57 -0600 Subject: [PATCH] Updates node-exporter and cAdvisor images (#852) * Updates node-exporter DaemonSet to v1.2.2 * Removes node-exporter from Prometheus deployment The only reason node-exporter was deployed within the Prometheus pod was to monitor the persistent volume that is mounted inside the pod. However, this same data is available through cAdvisor, and we will use that instead. * Updates cAdvisor version and enables disk metrics * Adds cAdvisor deployment to data-processing cluster * Removes node-exporter from Prom deployment The metrics that were useful from node-exporter, metrics about the persistent disk, inside the Prom pod will now be gathered by cAdvisor. * Updates node-exporter DaemonSet to v1.2.2 * Restores mistakenly deleted volumes section * Removes node_filesystem metrics in favor of container_fs * Revert "Adds cAdvisor deployment to data-processing cluster" This reverts commit 3d76093f76fcd2439dd8ce3bb4409c59033cf67a. * Uses kubelet instead of cAdvisor metrics * Adds cluster label to all kubernetes-nodes metrics * Updates Prom disk full alert with new metrics * Puts alert template inside double quotes * Updates Filesystem Available Estimate panel Replaces most node-exporter metrics with kubelet metrics about volumes in Filesystem Available Estimate panel of the Prometheus:SelfMonitoring dashboard. 
* Fixes kubelet volume metric scraped from DP cluster --- .../dashboards/Prometheus_SelfMonitoring.json | 503 +++++++++--------- config/federation/prometheus/alerts.yml | 18 +- .../prometheus/prometheus.yml.template | 10 +- .../deployments/node-exporter.yml | 2 +- .../deployments/prometheus.yml | 52 +- .../deployments/cadvisor.yml | 4 +- .../deployments/node-exporter.yml | 2 +- .../deployments/prometheus.yml | 52 -- 8 files changed, 260 insertions(+), 383 deletions(-) diff --git a/config/federation/grafana/dashboards/Prometheus_SelfMonitoring.json b/config/federation/grafana/dashboards/Prometheus_SelfMonitoring.json index 99d8873a..9c3f6e2b 100644 --- a/config/federation/grafana/dashboards/Prometheus_SelfMonitoring.json +++ b/config/federation/grafana/dashboards/Prometheus_SelfMonitoring.json @@ -15,13 +15,17 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 297, - "iteration": 1575931594609, + "id": 192, + "iteration": 1638825747083, "links": [], "panels": [ { "collapsed": false, "datasource": null, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -35,22 +39,39 @@ }, { "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], "datasource": "$datasource", - "decimals": null, - "format": "s", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] }, "gridPos": { "h": 4, @@ -61,39 +82,23 @@ "id": 40, "interval": null, "links": [], - "mappingType": 1, - 
"mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "options": {}, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" }, - "tableColumn": "", + "pluginVersion": "8.1.1", "targets": [ { "expr": "min(time() - process_start_time_seconds{container=\"prometheus\"})", @@ -105,36 +110,44 @@ "step": 1800 } ], - "thresholds": "", "title": "Uptime", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" + "type": "stat" }, { "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] }, "gridPos": { "h": 4, @@ -145,39 +158,23 @@ "id": 41, "interval": null, "links": [], - "mappingType": 1, - 
"mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "options": {}, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" }, - "tableColumn": "", + "pluginVersion": "8.1.1", "targets": [ { "expr": "max(prometheus_tsdb_blocks_loaded)", @@ -189,18 +186,8 @@ "step": 1800 } ], - "thresholds": "", "title": "TSDB Blocks in Memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" + "type": "stat" }, { "aliasColors": {}, @@ -208,6 +195,12 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -216,6 +209,7 @@ "x": 12, "y": 1 }, + "hiddenSeries": false, "id": 43, "legend": { "alignAsTable": true, @@ -233,9 +227,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -308,6 +303,12 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -316,6 +317,7 @@ "x": 0, "y": 5 }, + "hiddenSeries": false, "id": 38, "legend": { "avg": false, @@ -331,9 +333,10 
@@ "links": [], "nullPointMode": "null as zero", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -403,6 +406,12 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -411,6 +420,7 @@ "x": 8, "y": 5 }, + "hiddenSeries": false, "id": 39, "legend": { "avg": false, @@ -426,9 +436,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -501,6 +512,12 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -509,6 +526,7 @@ "x": 16, "y": 5 }, + "hiddenSeries": false, "id": 42, "legend": { "alignAsTable": true, @@ -526,9 +544,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -593,14 +612,21 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 12 }, + "hiddenSeries": false, "id": 49, "legend": { "alignAsTable": true, @@ -618,9 +644,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -713,14 +740,21 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, 
"fillGradient": 0, "gridPos": { "h": 7, - "w": 6, - "x": 6, + "w": 8, + "x": 8, "y": 12 }, + "hiddenSeries": false, "id": 48, "legend": { "alignAsTable": true, @@ -738,9 +772,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -827,112 +862,21 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 12 - }, - "id": 47, - "legend": { - "alignAsTable": true, - "avg": false, - "current": false, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(prometheus_tsdb_head_samples_appended_total[5m])*60", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Rate of Ingested Samples", - "refId": "A" + "fieldConfig": { + "defaults": { + "links": [] }, - { - "expr": "prometheus_tsdb_head_series", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Series in Memory", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Samples", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "overrides": [] }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - 
} - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 12 }, + "hiddenSeries": false, "id": 44, "legend": { "alignAsTable": true, @@ -950,9 +894,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -1024,6 +969,12 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -1032,24 +983,28 @@ "x": 0, "y": 19 }, - "id": 26, + "hiddenSeries": false, + "id": 47, "legend": { + "alignAsTable": true, "avg": false, "current": false, - "max": false, + "max": true, "min": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -1059,33 +1014,25 @@ "steppedLine": false, "targets": [ { - "expr": "deriv(avg by(mountpoint, workload) (node_filesystem_avail_bytes{mountpoint=\"/mnt/local\", pod!~\".*srglv.*\", cluster!~\".+\"})[1d:10m]) * 86400", + "expr": "rate(prometheus_tsdb_head_samples_appended_total[5m])*60", "format": "time_series", - "hide": false, - "interval": "60s", - "intervalFactor": 4, - "legendFormat": "", - "metric": "", - "refId": "A", - "step": 240 + "intervalFactor": 1, + "legendFormat": "Rate of Ingested Samples", + "refId": "A" }, { - "expr": "deriv(avg by(mountpoint) (node_filesystem_avail_bytes{mountpoint=\"/prometheus\", pod!~\".*srglv.*\"})[1d:10m]) * 86400", + "expr": 
"prometheus_tsdb_head_series", "format": "time_series", - "hide": false, - "interval": "60s", - "intervalFactor": 4, - "legendFormat": "", - "metric": "", - "refId": "C", - "step": 240 + "intervalFactor": 1, + "legendFormat": "Series in Memory", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Daily Filesystem Consumption Rate", + "title": "Samples", "tooltip": { "shared": true, "sort": 0, @@ -1101,11 +1048,11 @@ }, "yaxes": [ { - "format": "bytes", - "label": "", + "format": "short", + "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -1128,6 +1075,12 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 0, "fillGradient": 0, "gridPos": { @@ -1136,6 +1089,7 @@ "x": 12, "y": 19 }, + "hiddenSeries": false, "id": 25, "legend": { "avg": false, @@ -1151,9 +1105,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.1.1", "pointradius": 5, "points": false, "renderer": "flot", @@ -1163,40 +1118,62 @@ "steppedLine": false, "targets": [ { - "expr": "predict_linear(avg by(mountpoint) (node_filesystem_avail_bytes{mountpoint=\"/prometheus\"})[2d:10m], 10*86400)", + "exemplar": true, + "expr": "predict_linear(kubelet_volume_stats_available_bytes{cluster=\"prometheus-federation\", persistentvolumeclaim=\"auto-prometheus-disk0\"}[2d:10m], 10*86400)", "format": "time_series", "hide": false, "interval": "60s", "intervalFactor": 2, - "legendFormat": "{{deployment}} - 10 days", + "legendFormat": "prometheus-federation - 10 days", "refId": "C" }, { - "expr": "node_filesystem_avail_bytes{mountpoint=\"/prometheus\"}", + "exemplar": true, + "expr": "kubelet_volume_stats_available_bytes{cluster=\"prometheus-federation\", persistentvolumeclaim=\"auto-prometheus-disk0\"}", "format": "time_series", "hide": 
false, "interval": "60s", "intervalFactor": 2, - "legendFormat": "{{deployment}} - raw", + "legendFormat": "prometheus-federation - raw", "refId": "D" }, { - "expr": "predict_linear(avg by(mountpoint) (node_filesystem_avail_bytes{mountpoint=\"/mnt/local\", cluster!~\".+\"})[2d:10m], 10*86400)", + "exemplar": true, + "expr": "predict_linear(node_filesystem_avail_bytes{node=\"prometheus-platform-cluster\", mountpoint=\"/mnt/local\"}[2d:10m], 10*86400)", "format": "time_series", "hide": false, "interval": "60s", "intervalFactor": 2, - "legendFormat": "{{deployment}} - 10 days", + "legendFormat": "platform - 10 days", "refId": "E" }, { - "expr": "node_filesystem_avail_bytes{mountpoint=\"/mnt/local\", cluster!~\".+\"}", + "exemplar": true, + "expr": "node_filesystem_avail_bytes{node=\"prometheus-platform-cluster\", mountpoint=\"/mnt/local\"}", "format": "time_series", "hide": false, "interval": "60s", "intervalFactor": 2, - "legendFormat": "{{deployment}} - raw", + "legendFormat": "platform - raw", "refId": "F" + }, + { + "exemplar": true, + "expr": "predict_linear(kubelet_volume_stats_available_bytes{instance=~\"gke-data-processing.*\", persistentvolumeclaim=\"auto-prometheus-ssd0\"}[2d:10m], 10*86400)", + "format": "time_series", + "hide": false, + "interval": "60s", + "intervalFactor": 2, + "legendFormat": "data-processing - 10 days", + "refId": "A" + }, + { + "exemplar": true, + "expr": "kubelet_volume_stats_available_bytes{instance=~\"gke-data-processing.*\", persistentvolumeclaim=\"auto-prometheus-ssd0\"}", + "hide": false, + "interval": "60s", + "legendFormat": "data-processing - raw", + "refId": "B" } ], "thresholds": [], @@ -1219,6 +1196,7 @@ }, "yaxes": [ { + "$$hashKey": "object:89", "decimals": null, "format": "bytes", "label": "", @@ -1228,6 +1206,7 @@ "show": true }, { + "$$hashKey": "object:90", "format": "short", "label": null, "logBase": 1, @@ -1242,18 +1221,20 @@ } } ], - "refresh": false, - "schemaVersion": 20, + "refresh": "", + "schemaVersion": 30, 
"style": "dark", "tags": [], "templating": { "list": [ { "current": { - "tags": [], + "selected": true, "text": "Platform Cluster (mlab-oti)", "value": "Platform Cluster (mlab-oti)" }, + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": "Data source", @@ -1261,6 +1242,7 @@ "name": "datasource", "options": [], "query": "prometheus", + "queryValue": "", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -1300,6 +1282,5 @@ "timezone": "utc", "title": "Prometheus: Self-monitoring", "uid": "sVklmeHik", - "version": 139 - + "version": 149 } diff --git a/config/federation/prometheus/alerts.yml b/config/federation/prometheus/alerts.yml index 1d528076..6f8bc57c 100644 --- a/config/federation/prometheus/alerts.yml +++ b/config/federation/prometheus/alerts.yml @@ -1213,23 +1213,17 @@ groups: # that includes its own version of this alert. - alert: PrometheusPersistentDiskTooFull expr: | - ((node_filesystem_avail_bytes{cluster="data-processing", mountpoint="/prometheus"} - / node_filesystem_size_bytes{cluster="data-processing", mountpoint="/prometheus"}) < 0.05) OR - ((node_filesystem_avail_bytes{cluster="prometheus-federation", mountpoint="/prometheus"} - / node_filesystem_size_bytes{cluster="prometheus-federation", mountpoint="/prometheus"}) < 0.05) + ((kubelet_volume_stats_available_bytes{cluster="data-processing", persistentvolumeclaim="auto-prometheus-ssd0"} + / kubelet_volume_stats_capacity_bytes) < 0.05) OR + ((kubelet_volume_stats_available_bytes{cluster="prometheus-federation", persistentvolumeclaim="auto-prometheus-disk0"} + / kubelet_volume_stats_capacity_bytes) < 0.05) for: 1m labels: repo: ops-tracker severity: ticket - cluster: prometheus-federation + cluster: "{{ $labels.cluster }}" annotations: summary: The Prometheus persistent disk has less than 5% free space. - description: > - The Prometheus persistent disk has less than 5% free space. 
- Investigate filesystem usage on the VM, but most likely if this alert - fires it means that the size of the persistent disk is too small and - may need to be increased. GCE persistent disks can be resized, even on - a running VM. Please refer to the [instructions on how to do this][1]. - [1]: https://github.com/m-lab/k8s-support/blob/master/manage-cluster/PROMETHEUS.md#resizing-the-prometheus-vms-disk + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#prometheuspersistentdisktoofull dashboard: https://grafana.mlab-oti.measurementlab.net/d/sVklmeHik/prometheus-self-monitoring?orgId=1&var-datasource=default diff --git a/config/federation/prometheus/prometheus.yml.template b/config/federation/prometheus/prometheus.yml.template index 70d644e2..bd844248 100644 --- a/config/federation/prometheus/prometheus.yml.template +++ b/config/federation/prometheus/prometheus.yml.template @@ -138,7 +138,11 @@ scrape_configs: regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics - + # Add explicit cluster label to node metrics. + - source_labels: [] + regex: .* + target_label: cluster + replacement: prometheus-federation # kube-state-metrics reports status about k8s objects (pods, nodes, # deployments, etc). 
We cannot rely on service-discovery because the metric @@ -389,8 +393,8 @@ scrape_configs: 'match[]': - 'up{container="etl-gardener",instance=~".*:9090"}' - 'up{container="etl-parser",instance=~".*:9090"}' - - 'node_filesystem_size_bytes{deployment="node-exporter"}' - - 'node_filesystem_avail_bytes{deployment="node-exporter"}' + - 'kubelet_volume_stats_available_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}' + - 'kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}' static_configs: - targets: ['prometheus-data-processing.{{PROJECT}}.measurementlab.net:9090'] diff --git a/k8s/data-processing/deployments/node-exporter.yml b/k8s/data-processing/deployments/node-exporter.yml index 963d1067..9feada9c 100644 --- a/k8s/data-processing/deployments/node-exporter.yml +++ b/k8s/data-processing/deployments/node-exporter.yml @@ -15,7 +15,7 @@ spec: prometheus.io/scrape: 'true' spec: containers: - - image: prom/node-exporter:v1.1.1 + - image: prom/node-exporter:v1.2.2 name: node-exporter ports: - containerPort: 9100 diff --git a/k8s/data-processing/deployments/prometheus.yml b/k8s/data-processing/deployments/prometheus.yml index f0504e7d..ba59f06f 100644 --- a/k8s/data-processing/deployments/prometheus.yml +++ b/k8s/data-processing/deployments/prometheus.yml @@ -105,57 +105,6 @@ spec: - mountPath: /prometheus-config name: prometheus-config - # Run a node-exporter as part of the prometheus-server pod so that it has - # access to the same namespace and volumes as the prometheus-server. This - # allows simple disk usage monitoring of the "/prometheus" mount point. - - image: prom/node-exporter:v0.18.1 - name: node-exporter - # Note: only enable the filesystem collector, and ignore system paths. 
- args: ["--no-collector.arp", - "--no-collector.bcache", - "--no-collector.bonding", - "--no-collector.conntrack", - "--no-collector.cpu", - "--no-collector.cpufreq", - "--no-collector.diskstats", - "--no-collector.edac", - "--no-collector.entropy", - "--no-collector.filefd", - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)", - "--no-collector.hwmon", - "--no-collector.infiniband", - "--no-collector.ipvs", - "--no-collector.loadavg", - "--no-collector.mdadm", - "--no-collector.meminfo", - "--no-collector.netclass", - "--no-collector.netdev", - "--no-collector.netstat", - "--no-collector.nfs", - "--no-collector.nfsd", - "--no-collector.pressure", - "--no-collector.sockstat", - "--no-collector.stat", - "--no-collector.textfile", - "--no-collector.time", - "--no-collector.timex", - "--no-collector.uname", - "--no-collector.vmstat", - "--no-collector.xfs", - "--no-collector.zfs"] - ports: - - containerPort: 9100 - resources: - requests: - memory: "10Mi" - cpu: "50m" - limits: - memory: "10Mi" - cpu: "50m" - volumeMounts: - - mountPath: /prometheus - name: prometheus-storage - # Disks created manually, can be named here explicitly using # gcePersistentDisk instead of the persistentVolumeClaim. volumes: @@ -165,3 +114,4 @@ spec: - name: prometheus-config configMap: name: prometheus-cluster-config + diff --git a/k8s/prometheus-federation/deployments/cadvisor.yml b/k8s/prometheus-federation/deployments/cadvisor.yml index c952e8ef..1b9b76f5 100644 --- a/k8s/prometheus-federation/deployments/cadvisor.yml +++ b/k8s/prometheus-federation/deployments/cadvisor.yml @@ -19,7 +19,7 @@ spec: spec: containers: - name: cadvisor - image: k8s.gcr.io/cadvisor:v0.34.0 + image: gcr.io/cadvisor/cadvisor:v0.38.8 args: - --housekeeping_interval=60s - --max_housekeeping_interval=75s @@ -27,7 +27,7 @@ spec: - --event_storage_age_limit=default=0 # Note: tcp,udp stats are very expensive. # Enable only network, diskIO, cpu, memory. 
- - --disable_metrics=percpu,disk,tcp,udp + - --disable_metrics=percpu,tcp,udp # Only show stats for docker containers. - --docker_only resources: diff --git a/k8s/prometheus-federation/deployments/node-exporter.yml b/k8s/prometheus-federation/deployments/node-exporter.yml index 341e15e0..3fa2b46e 100644 --- a/k8s/prometheus-federation/deployments/node-exporter.yml +++ b/k8s/prometheus-federation/deployments/node-exporter.yml @@ -20,7 +20,7 @@ spec: spec: containers: - name: node-exporter - image: prom/node-exporter:v1.1.2 + image: prom/node-exporter:v1.2.2 args: - --collector.processes ports: diff --git a/k8s/prometheus-federation/deployments/prometheus.yml b/k8s/prometheus-federation/deployments/prometheus.yml index 9c01c0fd..d53eebbb 100644 --- a/k8s/prometheus-federation/deployments/prometheus.yml +++ b/k8s/prometheus-federation/deployments/prometheus.yml @@ -100,58 +100,6 @@ spec: # /etc/prometheus/prometheus.yml contains the M-Lab Prometheus config. - mountPath: /etc/prometheus name: prometheus-config - - # Run a node-exporter as part of the prometheus-server pod so that it has - # access to the same namespace and volumes as the prometheus-server. This - # allows simple disk usage monitoring of the "/prometheus" mount point. - - image: prom/node-exporter:v0.18.1 - name: node-exporter - # Note: only enable the filesystem collector, and ignore system paths. 
- args: ["--no-collector.arp", - "--no-collector.bcache", - "--no-collector.bonding", - "--no-collector.conntrack", - "--no-collector.cpu", - "--no-collector.cpufreq", - "--no-collector.diskstats", - "--no-collector.edac", - "--no-collector.entropy", - "--no-collector.filefd", - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)", - "--no-collector.hwmon", - "--no-collector.infiniband", - "--no-collector.ipvs", - "--no-collector.loadavg", - "--no-collector.mdadm", - "--no-collector.meminfo", - "--no-collector.netclass", - "--no-collector.netdev", - "--no-collector.netstat", - "--no-collector.nfs", - "--no-collector.nfsd", - "--no-collector.pressure", - "--no-collector.sockstat", - "--no-collector.stat", - "--no-collector.textfile", - "--no-collector.time", - "--no-collector.timex", - "--no-collector.uname", - "--no-collector.vmstat", - "--no-collector.xfs", - "--no-collector.zfs"] - ports: - - containerPort: 9100 - resources: - requests: - memory: "10Mi" - cpu: "50m" - limits: - memory: "10Mi" - cpu: "50m" - volumeMounts: - - mountPath: /prometheus - name: prometheus-storage - - image: measurementlab/gcp-service-discovery:v1.5.1 name: service-discovery args: [ "--aef-target=/targets/aeflex-targets/aeflex.json",