From ff2dfc9e21e4a2a6f66d3c48dab59df42d3ad748 Mon Sep 17 00:00:00 2001 From: Satyam Bhardwaj Date: Wed, 14 Jun 2023 17:59:06 +0530 Subject: [PATCH] add Pipelines Controller Observational Dashboard The goal is to get a better understanding of how our components behave under load so that we can make appropriate tunings/adjustments. The following metrics are being queried for the dashboard:- * tekton_pipelines_controller_running_pipelineruns_count - Number of pipelineruns executing currently * tekton_pipelines_controller_running_taskruns_count - Number of taskruns executing currently * tekton_pipelines_controller_taskruns_pod_latency - Scheduling latency for the taskruns pods * tekton_pipelines_controller_workqueue_depth - The workqueue for the Tekton Pipelines controller * Pipelines controller restarts * tekton_pipelines_controller_client_results - Total number of K8s API requests made by Pipelines Client Signed-off-by: Satyam Bhardwaj --- .../pipeline-service-dashboard.json | 636 ++++++++++++++++-- 1 file changed, 572 insertions(+), 64 deletions(-) diff --git a/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json b/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json index 2e07ea1bf..934e0e4b8 100644 --- a/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json +++ b/operator/gitops/argocd/grafana/dashboards/pipeline-service-dashboard.json @@ -21,7 +21,7 @@ "gnetId": null, "graphTooltip": 0, "id": 18, - "iteration": 1687847255863, + "iteration": 1688375985019, "links": [], "panels": [ { @@ -398,7 +398,7 @@ "x": 0, "y": 18 }, - "id": 17, + "id": 11, "panels": [], "repeat": "datasource", "scopedVars": { @@ -408,7 +408,7 @@ "value": "prometheus-appstudio-ds" } }, - "title": "Pipelines-as-code Monitoring", + "title": "Pipelines-Controller Resource Utlization", "type": "row" }, { @@ -417,7 +417,7 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Total average CPU usage of the 
Pipelines-as-code watcher and controller pods over the past hour.", + "description": "Total average CPU usage of the Pipelines over the past hour.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -431,13 +431,12 @@ "y": 19 }, "hiddenSeries": false, - "id": 21, + "id": 13, "legend": { "avg": false, "current": false, "max": false, "min": false, - "rightSide": true, "show": true, "total": false, "values": false @@ -453,6 +452,13 @@ "pointradius": 2, "points": false, "renderer": "flot", + "scopedVars": { + "datasource": { + "selected": true, + "text": "prometheus-appstudio-ds", + "value": "prometheus-appstudio-ds" + } + }, "seriesOverrides": [], "spaceLength": 10, "stack": false, @@ -460,7 +466,8 @@ "targets": [ { "exemplar": true, - "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{pod=~\"pipelines-as-code-(watcher|controller)-.*\"}[5m]))*100", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"tekton-pipelines-controller-.*\"}[5m]))", + "format": "table", "interval": "", "legendFormat": "", "refId": "A" @@ -486,7 +493,6 @@ }, "yaxes": [ { - "$$hashKey": "object:171", "format": "short", "label": null, "logBase": 1, @@ -495,7 +501,6 @@ "show": true }, { - "$$hashKey": "object:172", "format": "short", "label": null, "logBase": 1, @@ -515,7 +520,7 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Working set memory usage of Pipelines-as-code watcher and controller pods over the past hour.", + "description": "Working set memory usage of Pipelines Controller Containers ", "fieldConfig": { "defaults": {}, "overrides": [] @@ -529,13 +534,12 @@ "y": 19 }, "hiddenSeries": false, - "id": 19, + "id": 15, "legend": { "avg": false, "current": false, "max": false, "min": false, - "rightSide": true, "show": true, "total": false, "values": false @@ -551,6 +555,13 @@ "pointradius": 2, "points": false, "renderer": "flot", + "scopedVars": { + "datasource": { + "selected": true, + "text": "prometheus-appstudio-ds", + "value": 
"prometheus-appstudio-ds" + } + }, "seriesOverrides": [], "spaceLength": 10, "stack": false, @@ -558,7 +569,7 @@ "targets": [ { "exemplar": true, - "expr": "sum by (pod) (container_memory_working_set_bytes{pod=~\"pipelines-as-code-(watcher|controller)-.*\"})", + "expr": "sum(container_memory_working_set_bytes{pod=~\"tekton-pipelines-controller-.*\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -584,7 +595,6 @@ }, "yaxes": [ { - "$$hashKey": "object:80", "format": "short", "label": null, "logBase": 1, @@ -593,7 +603,6 @@ "show": true }, { - "$$hashKey": "object:81", "format": "short", "label": null, "logBase": 1, @@ -607,13 +616,155 @@ "alignLevel": null } }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 29, + "panels": [], + "repeat": "datasource", + "scopedVars": { + "datasource": { + "selected": true, + "text": "prometheus-appstudio-ds", + "value": "prometheus-appstudio-ds" + } + }, + "title": "Pipelines Controller Monitoring", + "type": "row" + }, + { + "datasource": null, + "description": "Number of Pipelineruns executing currently", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 0, + "y": 28 + }, + "id": 31, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "tekton_pipelines_controller_running_pipelineruns_count{prometheus=\"openshift-user-workload-monitoring/user-workload\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": 
"Pipelineruns Running", + "type": "stat" + }, + { + "datasource": null, + "description": "Number of Taskruns executing currently", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 6, + "y": 28 + }, + "id": 33, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "tekton_pipelines_controller_running_taskruns_count{prometheus=\"openshift-user-workload-monitoring/user-workload\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Taskruns Running", + "type": "stat" + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": null, - "description": "Number of Pipelineruns watched on pull_request and push events", + "description": "Frequency of Pipelines Controller restarts (This is experimental and may be removed in future)", "fieldConfig": { "defaults": {}, "overrides": [] @@ -621,19 +772,20 @@ "fill": 1, "fillGradient": 0, "gridPos": { - "h": 8, + "h": 10, "w": 12, - "x": 0, - "y": 27 + "x": 12, + "y": 28 }, "hiddenSeries": false, - "id": 27, + "id": 35, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": false @@ -656,7 +808,7 @@ "targets": [ { "exemplar": true, - "expr": "sum by (event_type) (rate(pac_watcher_pipelines_as_code_pipelinerun_count{prometheus=\"openshift-user-workload-monitoring/user-workload\"}[1h]))*3600", + "expr": 
"rate(kube_pod_container_status_restarts_total{namespace=\"openshift-pipelines\", pod=~\"tekton-pipelines-controller-.*\"}[7d])", "interval": "", "legendFormat": "", "refId": "A" @@ -666,7 +818,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "PipelineRuns Watched Per Hour", + "title": "Pipelines Controller Container Restarts", "tooltip": { "shared": true, "sort": 0, @@ -682,7 +834,6 @@ }, "yaxes": [ { - "$$hashKey": "object:440", "format": "short", "label": null, "logBase": 1, @@ -691,7 +842,6 @@ "show": true }, { - "$$hashKey": "object:441", "format": "short", "label": null, "logBase": 1, @@ -705,13 +855,269 @@ "alignLevel": null } }, + { + "datasource": null, + "description": "The workqueue for the Tekton Pipelines Reconcilers", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 37, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "avg(tekton_pipelines_controller_workqueue_depth)", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Workqueue Depth", + "type": "timeseries" + }, + { + "datasource": null, + "description": "Total number 
of Kubernetes API requests by Tekton Controller", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 39, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "rate(tekton_pipelines_controller_client_results{prometheus=\"openshift-user-workload-monitoring/user-workload\"}[1h])*3600", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Pipelines Client Results", + "type": "timeseries" + }, + { + "datasource": null, + "description": "Scheduling latency for the Taskruns pods", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 41, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "avg(tekton_pipelines_controller_taskruns_pod_latency)", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Taskruns Pod Latency", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 54 + }, + "id": 17, + "panels": [], + "repeat": "datasource", + "scopedVars": { + "datasource": { + "selected": true, + "text": "prometheus-appstudio-ds", + "value": "prometheus-appstudio-ds" + } + }, + "title": "Pipelines-as-code Monitoring", + "type": "row" + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": null, - "description": "The workqueue depth for the Pipelines-as-code Reconcilers", + "description": "Total average CPU usage of the Pipelines-as-code watcher and controller pods over the past hour.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -721,16 +1127,17 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 27 + "x": 0, + "y": 55 }, "hiddenSeries": false, - "id": 25, + "id": 21, "legend": { "avg": false, "current": false, "max": false, "min": false, + "rightSide": true, "show": true, "total": false, "values": false @@ -746,6 +1153,13 @@ "pointradius": 2, "points": false, "renderer": "flot", + "scopedVars": { + "datasource": { + "selected": true, + "text": "prometheus-appstudio-ds", + "value": "prometheus-appstudio-ds" + } + }, "seriesOverrides": [], "spaceLength": 10, "stack": false, @@ -753,7 +1167,7 @@ "targets": [ { "exemplar": true, - "expr": "avg(pac_watcher_workqueue_depth)", + "expr": "sum by (pod) 
(rate(container_cpu_usage_seconds_total{pod=~\"pipelines-as-code-(watcher|controller)-.*\"}[5m]))*100", "interval": "", "legendFormat": "", "refId": "A" @@ -763,7 +1177,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Watcher Workqueue Depth", + "title": "CPU Usage", "tooltip": { "shared": true, "sort": 0, @@ -779,7 +1193,7 @@ }, "yaxes": [ { - "$$hashKey": "object:351", + "$$hashKey": "object:171", "format": "short", "label": null, "logBase": 1, @@ -788,7 +1202,7 @@ "show": true }, { - "$$hashKey": "object:352", + "$$hashKey": "object:172", "format": "short", "label": null, "logBase": 1, @@ -808,7 +1222,7 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Total number of kubernetes api request for pipelines-as-code watcher", + "description": "Working set memory usage of Pipelines-as-code watcher and controller pods over the past hour.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -818,16 +1232,17 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 35 + "x": 12, + "y": 55 }, "hiddenSeries": false, - "id": 23, + "id": 19, "legend": { "avg": false, "current": false, "max": false, "min": false, + "rightSide": true, "show": true, "total": false, "values": false @@ -843,6 +1258,13 @@ "pointradius": 2, "points": false, "renderer": "flot", + "scopedVars": { + "datasource": { + "selected": true, + "text": "prometheus-appstudio-ds", + "value": "prometheus-appstudio-ds" + } + }, "seriesOverrides": [], "spaceLength": 10, "stack": false, @@ -850,7 +1272,7 @@ "targets": [ { "exemplar": true, - "expr": "rate(pac_watcher_client_results{prometheus=\"openshift-user-workload-monitoring/user-workload\"}[1h])*3600", + "expr": "sum by (pod) (container_memory_working_set_bytes{pod=~\"pipelines-as-code-(watcher|controller)-.*\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -860,7 +1282,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Watcher Client Results", + "title": "Memory Utilization", 
"tooltip": { "shared": true, "sort": 0, @@ -876,7 +1298,7 @@ }, "yaxes": [ { - "$$hashKey": "object:262", + "$$hashKey": "object:80", "format": "short", "label": null, "logBase": 1, @@ -885,7 +1307,7 @@ "show": true }, { - "$$hashKey": "object:263", + "$$hashKey": "object:81", "format": "short", "label": null, "logBase": 1, @@ -900,17 +1322,47 @@ } }, { - "collapsed": false, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": null, + "description": "Number of Pipelineruns watched on pull_request and push events", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 1, - "w": 24, + "h": 8, + "w": 12, "x": 0, - "y": 43 + "y": 63 }, - "id": 11, - "panels": [], - "repeat": "datasource", + "hiddenSeries": false, + "id": 27, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", "scopedVars": { "datasource": { "selected": true, @@ -918,8 +1370,61 @@ "value": "prometheus-appstudio-ds" } }, - "title": "Pipelines-Controller Resource Utlization", - "type": "row" + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by (event_type) (rate(pac_watcher_pipelines_as_code_pipelinerun_count{prometheus=\"openshift-user-workload-monitoring/user-workload\"}[1h]))*3600", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PipelineRuns Watched Per Hour", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:440", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:441", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, @@ -927,7 +1432,7 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Total average CPU usage of the Pipelines over the past hour.", + "description": "The workqueue depth for the Pipelines-as-code Reconcilers", "fieldConfig": { "defaults": {}, "overrides": [] @@ -937,11 +1442,11 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 44 + "x": 12, + "y": 63 }, "hiddenSeries": false, - "id": 13, + "id": 25, "legend": { "avg": false, "current": false, @@ -976,8 +1481,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"tekton-pipelines-controller-.*\"}[5m]))", - "format": "table", + "expr": "avg(pac_watcher_workqueue_depth)", "interval": "", "legendFormat": "", "refId": "A" @@ -987,7 +1491,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CPU Usage", + "title": "Watcher Workqueue Depth", "tooltip": { "shared": true, "sort": 0, @@ -1003,6 +1507,7 @@ }, "yaxes": [ { + "$$hashKey": "object:351", "format": "short", "label": null, "logBase": 1, @@ -1011,6 +1516,7 @@ "show": true }, { + "$$hashKey": "object:352", "format": "short", "label": null, "logBase": 1, @@ -1030,7 +1536,7 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Working set memory usage of Pipelines Controller Containers ", + "description": "Total number of kubernetes api request for pipelines-as-code watcher", "fieldConfig": { "defaults": {}, "overrides": [] @@ -1040,11 +1546,11 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 44 + "x": 0, + "y": 71 }, 
"hiddenSeries": false, - "id": 15, + "id": 23, "legend": { "avg": false, "current": false, @@ -1079,7 +1585,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(container_memory_working_set_bytes{pod=~\"tekton-pipelines-controller-.*\"})", + "expr": "rate(pac_watcher_client_results{prometheus=\"openshift-user-workload-monitoring/user-workload\"}[1h])*3600", "interval": "", "legendFormat": "", "refId": "A" @@ -1089,7 +1595,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Memory Utilization", + "title": "Watcher Client Results", "tooltip": { "shared": true, "sort": 0, @@ -1105,6 +1611,7 @@ }, "yaxes": [ { + "$$hashKey": "object:262", "format": "short", "label": null, "logBase": 1, @@ -1113,6 +1620,7 @@ "show": true }, { + "$$hashKey": "object:263", "format": "short", "label": null, "logBase": 1, @@ -1164,5 +1672,5 @@ "timezone": "", "title": "Pipeline Service", "uid": "02ebfdefeeed166624895c36b0c1af4ed3006c5d", - "version": 3 + "version": 4 } \ No newline at end of file