diff --git a/common/monitoring/grafana/dashboards/gpu-mterics-dashboard.json b/common/monitoring/grafana/dashboards/gpu-mterics-dashboard.json new file mode 100644 index 0000000..6774632 --- /dev/null +++ b/common/monitoring/grafana/dashboards/gpu-mterics-dashboard.json @@ -0,0 +1,530 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:192", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard displays GPU metrics from the NVIDIA DCGM Exporter on a Kubernetes (1.19+) cluster", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 12239, + "graphTooltip": 0, + "id": 11, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend":
true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "avg by(UUID) (DCGM_FI_DEV_GPU_TEMP{UUID=\"$GPU\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Temperature Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 90, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 7, + "x": 0, + "y": 8 + }, + "id": 22, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.3.4", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{UUID=\"$GPU\"})", + "instant": false, + "legendFormat": "{{namespace}} : {{container}}: {{DCGM_FI_DEV_GPU_TEMP}}", + "range": true, + "refId": "A" + } + ], + "title": "Temperature", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 1, + "fieldMinMax": false, + "mappings": [], + "min": -2, + "unit": "decmbytes", +
"unitScale": false + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "avg(DCGM_FI_DEV_FB_FREE{UUID=\"GPU-8e7e703c-1d10-e2f8-0f58-192fefcedba8\", instance=~\"metrics\\\\.k8s\\\\.bhasai\\\\.samagra\\\\.io:80\"})" + }, + "properties": [ + { + "id": "displayName", + "value": "FREE" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 5, + "x": 7, + "y": 8 + }, + "id": 29, + "options": { + "displayLabels": [ + "value", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg(DCGM_FI_DEV_FB_USED{UUID=\"$GPU\"})", + "instant": false, + "interval": "", + "legendFormat": "USED", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg(DCGM_FI_DEV_FB_FREE{UUID=\"$GPU\"})", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Memory Metrics", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "watt", + "unitScale": true + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "bhasai-dev : bge-small-gpu-embeddings: ", + "bhasai-dev : mini-lm-embed: ", + "bhasai-dev : ner: ", + "bhasai-dev : pdf-parser-celery-worker: ", + 
"bhasai-dev : text-translation-azure-dict: " + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 7, + "x": 12, + "y": 8 + }, + "id": 31, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_POWER_USAGE{UUID=\"$GPU\"}", + "instant": false, + "legendFormat": "{{namespace}} : {{container}}: {{DCGM_FI_DEV_POWER_USAGE}} ", + "range": true, + "refId": "A" + } + ], + "title": "Power Usage", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 7, + "x": 0, + "y": 14 + }, + "id": 19, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.3.4", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "avg(DCGM_FI_DEV_POWER_USAGE{UUID=\"$GPU\"})", + "instant": false, + "legendFormat": "{{namespace}} : {{container}}: 
{{DCGM_FI_DEV_POWER_USAGE}} ", + "range": true, + "refId": "A" + } + ], + "title": "Power Usage", + "type": "gauge" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "GPU-666b9a39-fb25-7ab5-3119-4e4d5db26455", + "value": "GPU-666b9a39-fb25-7ab5-3119-4e4d5db26455" + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(DCGM_FI_DEV_ENC_UTIL,UUID)", + "hide": 1, + "includeAll": false, + "label": "GPU", + "multi": false, + "name": "GPU", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(DCGM_FI_DEV_ENC_UTIL,UUID)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timeRangeUpdatedDuringEditOrView": false, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "GPU Metrics Dashboard", + "uid": "Oxed_c6W1", + "version": 11, + "weekStart": "" + } \ No newline at end of file