diff --git a/grafana/json-models/standalone.json b/grafana/json-models/standalone.json index 3b315365..d30f60b3 100644 --- a/grafana/json-models/standalone.json +++ b/grafana/json-models/standalone.json @@ -59,8 +59,8 @@ "overrides": [] }, "gridPos": { - "h": 5, - "w": 8, + "h": 6, + "w": 6, "x": 0, "y": 1 }, @@ -100,6 +100,73 @@ "title": "Total Number of GPUs", "type": "stat" }, + { + "datasource": { + "type": "prometheus", + "uid": "${source}" + }, + "description": "Number of GPUs with a utilization higher than 1%.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 107, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0-73179", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${source}" + }, + "editorMode": "code", + "expr": "count(rocm_utilization_percentage > 1)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active GPUs", + "type": "stat" + }, { "datasource": { "type": "prometheus", @@ -132,9 +199,9 @@ "overrides": [] }, "gridPos": { - "h": 5, - "w": 8, - "x": 8, + "h": 6, + "w": 6, + "x": 12, "y": 1 }, "id": 30, @@ -205,9 +272,9 @@ "overrides": [] }, "gridPos": { - "h": 5, - "w": 8, - "x": 16, + "h": 6, + "w": 6, + "x": 18, "y": 1 }, "id": 32, @@ -252,9 +319,9 @@ "h": 1, "w": 24, "x": 0, - "y": 6 + "y": 7 }, - "id": 9, + "id": 104, "panels": [ { "datasource": { @@ -366,9 +433,23 @@ "refId": "B" } ], - "title": "Cluster Utilization (%)", + "title": "Average GPU Utilization (%)", "type": "timeseries" - }, + } + ], + "title": "GPU Usage - Global", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 103, + "panels": [ { "datasource": { "type": "prometheus", @@ -393,7 +474,7 @@ "h": 9, "w": 24, "x": 0, - "y": 42 + "y": 146 }, "id": 6, "options": { @@ -465,7 +546,7 @@ "type": "heatmap" } ], - "title": "GPU - Utilization", + "title": "GPU Usage - Nodes", "type": "row" }, { @@ -474,7 +555,130 @@ "h": 1, "w": 24, "x": 0, - "y": 7 + "y": 9 + }, + "id": 9, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 100, + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 147 + }, + "id": 102, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.2.0-73179", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${source}" + }, + "editorMode": "code", + "expr": "rocm_utilization_percentage", + "instant": false, + "legendFormat": "{{instance}}, {{card}}", + "range": true, + "refId": "A" + } + ], + "title": "Cluster Utilization (%)", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(.*):(.*), (.*)", + "renamePattern": "Note: $1 ($3)" + } + } + ], + "type": "timeseries" + } + ], + "title": "GPU Usage - GPUs", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 }, "id": 40, "panels": [ @@ -550,7 +754,7 @@ "h": 9, "w": 24, "x": 0, - "y": 8 + "y": 10 }, "id": 29, "options": { @@ -596,7 +800,7 @@ "type": "timeseries" } ], - "title": "GPU - Temperature", + "title": "GPU Temperature - GPUs", "type": "row" }, { @@ -605,7 +809,130 @@ "h": 1, "w": 24, "x": 0, - "y": 8 + "y": 11 + }, + "id": 105, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 100, + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 106, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.2.0-73179", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${source}" + }, + "editorMode": "code", + "expr": "avg by (instance) (rocm_average_socket_power_watts) ", + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Power", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(.*):(.*)", + "renamePattern": "Node: $1" + } + } + ], + "type": "timeseries" + } + ], + "title": "GPU Power - Nodes", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 }, "id": 100, "panels": [ @@ -673,7 +1000,7 @@ "h": 9, "w": 24, "x": 0, - "y": 297 + "y": 11 }, "id": 101, "options": { @@ -719,7 +1046,7 @@ "type": "timeseries" } ], - "title": "GPU - Power", + "title": "GPU Power - GPUs", "type": "row" }, { @@ -728,7 +1055,7 @@ "h": 1, "w": 24, "x": 0, - "y": 9 + "y": 13 }, "id": 36, "panels": [ @@ -797,7 +1124,7 @@ "h": 8, "w": 24, "x": 0, - "y": 214 + "y": 57 }, "id": 35, "options": { @@ -841,7 +1168,7 @@ "type": "timeseries" } ], - "title": "CPU - Load Average", + "title": "CPU Usage - Nodes", "type": "row" }, { @@ -850,7 +1177,7 @@ "h": 1, "w": 24, "x": 0, - "y": 10 + "y": 14 }, "id": 67, "panels": [ @@ -920,7 +1247,7 @@ "h": 8, "w": 24, "x": 0, - "y": 308 + "y": 14 }, "id": 92, "options": { @@ -944,7 +1271,7 @@ "uid": "${source}" }, "editorMode": "code", - "expr": "sum((sum by (instance) (rate(node_infiniband_port_data_received_bytes_total[$__rate_interval]))))", + "expr": "sum(rate(node_infiniband_port_data_received_bytes_total[$__rate_interval]))", "instant": false, "legendFormat": "Received", "range": true, @@ -956,7 +1283,7 @@ "uid": "${source}" }, "editorMode": "code", - "expr": "sum((sum by (instance) (rate(node_infiniband_port_data_transmitted_bytes_total[$__rate_interval]))))", + "expr": "sum(rate(node_infiniband_port_data_transmitted_bytes_total[$__rate_interval]))", "hide": false, "instant": false, "legendFormat": "Transmitted", @@ -977,7 +1304,7 @@ "type": "timeseries" } ], - "title": "Network - Bandwidth", + "title": "Network Bandwidth - Global", "type": "row" } ], @@ -992,20 +1319,18 @@ { "hide": 2, "includeAll": false, - "multi": false, "name": "source", "options": [], "query": "prometheus", "refresh": 1, "regex": "", - "skipUrlSync": false, "type": "datasource" }, { "hide": 2, "name": "node_exporter_port", "query": "9100", - "skipUrlSync": false, + "skipUrlSync": true, "type": "constant" } ]