diff --git a/config/federation/grafana/dashboards/MLABNS_PrometheusQueries.json b/config/federation/grafana/dashboards/MLABNS_PrometheusQueries.json index d319788a..4923fb00 100644 --- a/config/federation/grafana/dashboards/MLABNS_PrometheusQueries.json +++ b/config/federation/grafana/dashboards/MLABNS_PrometheusQueries.json @@ -16,7 +16,7 @@ "gnetId": null, "graphTooltip": 0, "id": 357, - "iteration": 1616437301517, + "iteration": 1617995965275, "links": [], "panels": [ { @@ -35,7 +35,7 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, + "w": 6, "x": 0, "y": 0 }, @@ -130,8 +130,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 8, + "w": 6, + "x": 6, "y": 0 }, "hiddenSeries": false, @@ -215,6 +215,7 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "", "fieldConfig": { "defaults": { "custom": {} @@ -225,8 +226,103 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 16, + "w": 6, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n min by (experiment, machine) (\n probe_success{service=\"neubot$protocol\", experiment=\"neubot\"} OR\n probe_success{service=\"neubot_tls$protocol\", experiment=\"neubot\"} OR\n label_replace(kube_node_spec_taint{cluster=\"platform-cluster\", key=\"lame-duck\"},\n \"experiment\", \"neubot.mlab\", \"\", \"\") != bool 1 OR\n label_replace(gmx_machine_maintenance, \"experiment\", \"neubot.mlab\", \"\", \"\") != bool 1\n )\n) /\ncount(\n probe_success{service=\"neubot$protocol\", experiment=\"neubot\"} unless on(machine)\n (gmx_machine_maintenance == 1 unless on(machine) kube_node_status_condition)\n)", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "neubot $protocol: % up", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, "y": 0 }, "hiddenSeries": false, @@ -320,7 +416,7 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, + "w": 6, "x": 0, "y": 7 }, @@ -415,8 +511,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 8, + "w": 6, + "x": 6, "y": 7 }, "hiddenSeries": false, @@ -494,6 +590,102 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(probe_success{service=\"neubot$protocol\", experiment=\"neubot\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "neubot $protocol: probe_success", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -510,8 +702,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 16, + "w": 6, + "x": 18, "y": 7 }, "hiddenSeries": false, @@ -605,7 +797,7 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, + "w": 6, "x": 0, "y": 14 }, @@ -700,8 +892,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 16, + "w": 6, + "x": 18, "y": 14 }, "hiddenSeries": false, @@ -794,7 +986,7 @@ "list": [ { "current": { - "selected": false, + "selected": true, "text": "Prometheus (mlab-oti)", "value": "Prometheus (mlab-oti)" }, @@ -815,7 +1007,7 @@ { "allValue": null, "current": { - "selected": false, + "selected": true, "text": "IPv4", "value": "" }, @@ -852,6 +1044,5 @@ "timezone": "", "title": "MLAB-NS: Prometheus Queries", "uid": "T-t8rWwGz", - "version": 16 + "version": 19 } - diff --git a/config/federation/prometheus/alerts.yml b/config/federation/prometheus/alerts.yml index 46803f97..d889affa 100644 --- a/config/federation/prometheus/alerts.yml +++ b/config/federation/prometheus/alerts.yml @@ -252,7 +252,7 @@ groups: severity: ticket annotations: summary: Less than 90% of ndt experiments are online according to mlab-ns. - description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyNDTServersDown + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries # "ndt_ipv6" mlab-ns query @@ -283,7 +283,7 @@ groups: severity: ticket annotations: summary: Less than 75% of ndt_ipv6 experiments are online according to mlab-ns. - description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyNDTServersDown + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries # "ndt_ssl" mlab-ns query @@ -315,7 +315,7 @@ groups: page_project: mlab-oti annotations: summary: Less than 90% of ndt_ssl experiments are online according to mlab-ns. - description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyNDTServersDown + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries # "ndt_ssl_ipv6" mlab-ns query @@ -346,7 +346,7 @@ groups: severity: ticket annotations: summary: Less than 75% of ndt_ssl_ipv6 experiments are online according to mlab-ns. - description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyNDTServersDown + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries # "ndt7" mlab-ns query @@ -377,7 +377,7 @@ groups: severity: ticket annotations: summary: Less than 90% of ndt7 experiments are online according to mlab-ns. - description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyNDTServersDown + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries # "ndt7_ipv6" mlab-ns query @@ -408,7 +408,61 @@ groups: severity: ticket annotations: summary: Less than 75% of ndt7_ipv6 experiments are online according to mlab-ns. - description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyNDTServersDown + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown + dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries + + # "neubot" mlab-ns query + - alert: PlatformCluster_TooManyNeubotIPv4ServersDown + expr: | + ( + sum( + min by (experiment, machine) ( + probe_success{service="neubot"} OR + probe_success{service="neubot_tls"} OR + label_replace(kube_node_spec_taint{cluster="platform-cluster", key="lame-duck"}, + "experiment", "neubot.mlab", "", "") != bool 1 OR + label_replace(gmx_machine_maintenance, "experiment", "neubot.mlab", "", "") != bool 1 + ) + ) / + count( + probe_success{service="neubot"} unless on(machine) + (gmx_machine_maintenance == 1 unless on(machine) kube_node_status_condition) + ) + ) < 0.90 + for: 10m + labels: + repo: ops-tracker + severity: ticket + annotations: + summary: Less than 90% of neubot experiments are online according to mlab-ns. + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown + dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries + + # "neubot_ipv6" mlab-ns query + - alert: PlatformCluster_TooManyNeubotIPv6ServersDown + expr: | + ( + sum( + min by (experiment, machine) ( + probe_success{service="neubot_ipv6"} OR + probe_success{service="neubot_tls_ipv6"} OR + label_replace(kube_node_spec_taint{cluster="platform-cluster", key="lame-duck"}, + "experiment", "neubot.mlab", "", "") != bool 1 OR + label_replace(gmx_machine_maintenance, "experiment", "neubot.mlab", "", "") != bool 1 + ) + ) / + count( + probe_success{service="neubot_ipv6"} unless on(machine) + (gmx_machine_maintenance == 1 unless on(machine) kube_node_status_condition) + ) + ) < 0.75 + for: 10m + labels: + repo: ops-tracker + severity: ticket + annotations: + summary: Less than 75% of neubot_ipv6 experiments are online according to mlab-ns. + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#PlatformCluster_TooManyServersDown dashboard: https://grafana.mlab-oti.measurementlab.net/d/T-t8rWwGz/mlab-ns-prometheus-queries # Check 5xx errors for the rate-limiter deployment, too.