Add an alert for Locate V2 request errors (#968)
* Add an alert for Locate V2 request errors

* Use lower threshold and rate

* Modify rate

* Add restarts alert

* Increase threshold
cristinaleonr authored Jan 23, 2023
1 parent 1153f21 commit 3399465
Showing 1 changed file with 30 additions and 0 deletions.
config/federation/prometheus/alerts.yml (30 additions, 0 deletions)
@@ -37,6 +37,19 @@ groups:
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 10 minutes.'

  # TooManyProcessRestarts: a process has restarted more than five times in the last hour.
  - alert: TooManyProcessRestarts
    expr: resets(process_cpu_seconds_total[1h]) > 5
    for: 10m
    labels:
      repo: dev-tracker
      severity: ticket
      cluster: prometheus-federation
    annotations:
      summary: Instance {{ $labels.instance }} has restarted more than five times in the
        last hour.
      description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#toomanyprocessrestarts
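
One way to sanity-check this threshold is a promtool rules unit test. The sketch below is illustrative only: the test file name, the demo instance/job label values, the synthetic counter series, and the relative path to alerts.yml are assumptions, not part of this change. The synthetic counter drops to zero every 8 minutes, giving roughly seven resets in any one-hour window, which keeps the expression above the > 5 threshold long enough to satisfy for: 10m.

# test_alerts.yml (hypothetical); run with: promtool test rules test_alerts.yml
rule_files:
  - alerts.yml  # assumes the test file sits next to alerts.yml
evaluation_interval: 1m
tests:
  - interval: 4m
    input_series:
      # A counter that falls back to zero every 8 minutes, so
      # resets(process_cpu_seconds_total[1h]) stays well above 5.
      - series: 'process_cpu_seconds_total{instance="host:9090", job="demo"}'
        values: '9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9'
    alert_rule_test:
      - eval_time: 80m
        alertname: TooManyProcessRestarts
        exp_alerts:
          - exp_labels:
              instance: "host:9090"
              job: demo
              repo: dev-tracker
              severity: ticket
              cluster: prometheus-federation
            exp_annotations:
              summary: Instance host:9090 has restarted more than five times in the last hour.
              description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#toomanyprocessrestarts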

##
## SLOs
##
@@ -567,6 +580,23 @@ groups:
      summary: A critical metric about mlab-ns is missing.
      description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#mlabns_serverresponsemetricmissing

  # The alert Locate_TooManyErrors will fire when the error rate of /nearest requests
  # to Locate V2 is greater than 0.1% for 2 minutes. The locate_requests_total metrics
  # are collected in Prometheus.
  - alert: Locate_TooManyErrors
    expr: |
      (sum(rate(locate_requests_total{type="nearest", status!="OK"}[2m]))) /
      sum(rate(locate_requests_total{type="nearest"}[2m])) > 0.001
    for: 2m
    labels:
      repo: dev-tracker
      severity: ticket
      cluster: prometheus-federation
    annotations:
      summary: The rate of Locate V2 nearest request errors is greater than 0.1%.
      description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#locate_toomanyerrors
      dashboard: https://grafana.mlab-oti.measurementlab.net/d/8O9tInk4k/locate-service?orgId=1&refresh=5m&var-datasource=Prometheus%20%28mlab-oti%29&var-platformdatasource=Platform%20Cluster%20%28mlab-oti%29&var-bigquerydatasource=BigQuery%20%28mlab-oti%29&var-metro=All&var-experiment=ndt&viewPanel=4
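
When this alert fires, the same metric can be broken down by status in the Prometheus console (or Grafana Explore) to see which error classes drive the ratio. These ad-hoc triage queries are suggestions only and are not part of the committed rule:

# Non-OK /nearest request rate, split by status, over the same 2m window.
sum by (status) (rate(locate_requests_total{type="nearest", status!="OK"}[2m]))

# The alerting ratio itself, for comparison against the 0.001 (0.1%) threshold.
  sum(rate(locate_requests_total{type="nearest", status!="OK"}[2m]))
/ sum(rate(locate_requests_total{type="nearest"}[2m]))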

  # If container logs for a node are missing in Stackdriver for too long, then
  # fire an alert, unless the node is in maintenance or lame-duck mode. This
  # somewhat awkward query discovers cases where the stackdriver metric has been
