diff --git a/config/federation/bigquery/bq_gardener_historical.sql.template b/config/federation/bigquery/bq_gardener_historical.sql.template index 2da48a6d..bff27458 100644 --- a/config/federation/bigquery/bq_gardener_historical.sql.template +++ b/config/federation/bigquery/bq_gardener_historical.sql.template @@ -52,14 +52,32 @@ WITH all_types AS ( `{{PROJECT}}.raw_ndt.scamper1` WHERE date > date('2019-03-28') +), + +processed_types AS ( + SELECT + datatype, + COUNT(distinct date) AS value_throughput + FROM + all_types + WHERE + (parseTime BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR) AND CURRENT_TIMESTAMP()) + AND date < DATE_SUB(DATE(parseTime), INTERVAL 1 DAY) -- exclude daily processing: keep only dates more than one day before their parse date. + GROUP BY + datatype ) SELECT - datatype, COUNT(distinct date) AS value_throughput + datatype, value_throughput +FROM processed_types +UNION ALL +-- export 0 throughput for datatypes in all_types that have no row in processed_types (nothing historical parsed in the last 24 hours) +SELECT + all_types.datatype, 0 AS value_throughput FROM - all_types + all_types LEFT JOIN processed_types + ON (all_types.datatype = processed_types.datatype) WHERE - (parseTime BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR) AND CURRENT_TIMESTAMP()) - AND date < DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) -- exclude daily.
+ processed_types.datatype IS NULL GROUP BY datatype diff --git a/config/federation/grafana/dashboards/Pipeline_AlternativeSLIs.json b/config/federation/grafana/dashboards/Pipeline_AlternativeSLIs.json new file mode 100644 index 00000000..d68e1917 --- /dev/null +++ b/config/federation/grafana/dashboards/Pipeline_AlternativeSLIs.json @@ -0,0 +1,768 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1642100563404, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 8, + "panels": [], + "title": "Freshness", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${PrometheusDS}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 17, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by (datatype) (bq_gardener_daily_done_last_4_days)", + "instant": false, + "interval": "", + "legendFormat": "{{datatype}} ", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + 
"timeShift": null, + "title": "Daily Jobs (Last 4 Days)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:371", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:372", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "${PrometheusDS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "from": 1, + "result": { + "color": "green", + "index": 0, + "text": "✓" + }, + "to": 100 + }, + "type": "range" + }, + { + "options": { + "from": 0, + "result": { + "color": "red", + "index": 1, + "text": "☓" + }, + "to": 1 + }, + "type": "range" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 17, + "y": 1 + }, + "id": 24, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value_and_name" + }, + "pluginVersion": "8.1.1", + "targets": [ + { + "exemplar": true, + "expr": "sum(last_over_time(gardener_jobs_total{status=\"success\", daily=\"true\"}[1d])) by (datatype)", + "interval": "", + "legendFormat": "{{datatype}} ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Gardener Latest Daily Job Completed", + "type": "stat" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "panels": [], + 
"title": "Coverage", + "type": "row" + }, + { + "datasource": "${PrometheusDS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "(sum(rate(etl_task_total{status!=\"OK\", cluster=\"data-processing\"}[1h])) by (table) /\nsum(rate(etl_task_total{cluster=\"data-processing\"}[1h])) by (table))", + "interval": "", + "legendFormat": "{{table}}", + "refId": "A" + } + ], + "title": "Parser Failure Rate (Tasks / Hour)", + "type": "timeseries" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${PrometheusDS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.1", + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "(sum(rate(gardener_jobs_total{status!=\"success\"}[1d])) by (datatype) /\nsum(rate(gardener_jobs_total[1d])) by (datatype))", + "interval": "", + "legendFormat": "{{datatype}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Gardener Failure Rate (Jobs / Day)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:373", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:374", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 12, + "panels": [], + "title": "Throughput", + "type": "row" + }, + { + "datasource": "${Gardener2_DS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "8.1.1", + "targets": [ + { + "exemplar": true, + "expr": "sum by (datatype) (increase(gardener_jobs_total{status=\"success\", daily=\"false\"}[1d]))", + "interval": "", + "legendFormat": "{{datatype}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Gardener Self-Reported Historical Throughput", + "type": "timeseries" + }, + { + "datasource": "${PrometheusDS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "8.1.1", + "targets": [ + { + "exemplar": true, + "expr": "sum by (datatype) (bq_gardener_historical_throughput)", + "interval": "", + "legendFormat": "{{datatype}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "BigQuery Exporter 
Verified Historical Throughput", + "type": "timeseries" + }, + { + "datasource": "${PrometheusDS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "8.1.1", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(etl_task_total{cluster=\"data-processing\"}[1h])) by (table)", + "interval": "", + "legendFormat": "{{table}}", + "refId": "A" + } + ], + "title": "Parser Task Rate (Hourly)", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": true, + "text": "mlab-sandbox", + "value": "mlab-sandbox" + }, + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "Project", + "multi": false, + "name": "project", + "options": [ + { + "selected": true, + "text": "mlab-sandbox", + "value": "mlab-sandbox" + }, + { + "selected": false, + "text": "mlab-staging", + "value": "mlab-staging" + }, + { + "selected": false, + "text": "mlab-oti", + "value": "mlab-oti" + } 
+ ], + "query": "mlab-sandbox,mlab-staging,mlab-oti", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": false, + "text": "Prometheus (mlab-sandbox)", + "value": "Prometheus (mlab-sandbox)" + }, + "description": null, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "PrometheusDS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "Prometheus \\($project\\)", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "Data Processing (mlab-sandbox)", + "value": "Data Processing (mlab-sandbox)" + }, + "description": null, + "error": null, + "hide": 2, + "includeAll": false, + "label": "Gardener 2.0 Datasource", + "multi": false, + "name": "Gardener2_DS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "Data Processing \\($project\\)", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Pipeline SLIs", + "uid": "q4MrNzh7k", + "version": 23 +} diff --git a/config/federation/prometheus/alerts.yml b/config/federation/prometheus/alerts.yml index 38b87b48..b6561763 100644 --- a/config/federation/prometheus/alerts.yml +++ b/config/federation/prometheus/alerts.yml @@ -843,7 +843,7 @@ groups: # deployment, so the timeout for this alert is 3 hours and 10 minutes. - alert: GardenerHistoricalThroughputIsStalled expr: | - increase(gardener_jobs_total{status="success"}[1d] offset 24h) > 0 + increase(gardener_jobs_total{status="success", daily="false"}[1d] offset 24h) > 0 UNLESS ON(datatype) bq_gardener_historical_throughput > 0 for: 190m labels: @@ -855,6 +855,23 @@ groups: description: Gardener runs in the "data-processing" cluster. 
dashboard: https://grafana.mlab-oti.measurementlab.net/d/eBbUW6oik/pipeline-gardener?var-project=mlab-oti&var-pipelineDatasource=Data%20Processing%20(mlab-oti) +# GardenerConfigDatatypeMissingInGardenerHistoricalThroughputQuery fires when +# a datatype in the Gardener config (gardener_config_datatypes) has had no +# bq_gardener_historical_throughput series with the same datatype label for 190 minutes. + - alert: GardenerConfigDatatypeMissingInGardenerHistoricalThroughputQuery + expr: | + gardener_config_datatypes UNLESS ON(datatype) bq_gardener_historical_throughput + for: 190m + labels: + repo: dev-tracker + severity: ticket + cluster: prometheus-federation + annotations: + summary: Datatype {{ $labels.datatype }} in Gardener config missing in + bq_gardener_historical_throughput query. + description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#gardenerconfigdatatypemissingingardenerhistoricalthroughputquery + dashboard: https://grafana.mlab-oti.measurementlab.net/d/q4MrNzh7k/pipeline-slis?from=now-2d&to=now&var-project=mlab-oti&var-PrometheusDS=Prometheus%20(mlab-oti)&var-Gardener2_DS=Data%20Processing%20(mlab-oti)&orgId=1 + # GardenerFailureRateTooHighOrMissing fires when the number of failed Gardener jobs # in the last day rises above 1% or the number of total jobs is not reported. - alert: GardenerFailureRateTooHighOrMissing @@ -874,8 +891,8 @@ groups: # ParserFailureRateTooHighOrMissing fires when the number of failed parser tasks # in the last hour rises above 1% or the number of total tasks is not reported. 
- alert: ParserFailureRateTooHighOrMissing - expr: (sum(rate(etl_task_total{status!="OK", cluster="data-processing"}[1h])) by (datatype) / - sum(rate(etl_task_total{cluster="data-processing"}[1h])) by (datatype)) > 0.01 + expr: (sum(rate(etl_task_total{status!="OK", cluster="data-processing"}[1h])) by (table) / + sum(rate(etl_task_total{cluster="data-processing"}[1h])) by (table)) > 0.01 OR absent(etl_task_total{cluster="data-processing"}) for: 10m labels: @@ -883,7 +900,7 @@ groups: severity: ticket cluster: prometheus-federation annotations: - summary: Parser task failure rate above 1% or missing for {{ $labels.datatype }}. + summary: Parser task failure rate above 1% or missing for {{ $labels.table }}. description: Parsers filtered by cluster="data-processing". dashboard: https://grafana.mlab-oti.measurementlab.net/d/UTgnK-jMz/pipeline-overview?orgId=1&refresh=5m&from=now-2d&to=now&var-project=mlab-oti&var-PrometheusDS=Prometheus%20(mlab-oti)&var-LegacyDS=Data%20Proc%20(mlab-oti)&var-Gardener2_DS=Data%20Processing%20(mlab-oti)&var-states=Finishing&var-states=Processing&var-states2=complete diff --git a/config/federation/prometheus/prometheus.yml.template b/config/federation/prometheus/prometheus.yml.template index ee70732a..41113a65 100644 --- a/config/federation/prometheus/prometheus.yml.template +++ b/config/federation/prometheus/prometheus.yml.template @@ -396,6 +396,7 @@ scrape_configs: - 'kubelet_volume_stats_available_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}' - 'kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="auto-prometheus-ssd0"}' - 'gardener_jobs_total' + - 'gardener_config_datatypes' - 'etl_task_total' static_configs: - targets: ['prometheus-data-processing.{{PROJECT}}.measurementlab.net:9090']