From d7818dd9e3515edfb95d4919c8e4f3de546fcff9 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 28 Jan 2025 23:16:00 -0500 Subject: [PATCH] update dashboard and server load metric --- helm/supersonic/dashboards/default.json | 174 +++++++++++++++++++++++- helm/supersonic/templates/_helpers.tpl | 8 +- 2 files changed, 176 insertions(+), 6 deletions(-) diff --git a/helm/supersonic/dashboards/default.json b/helm/supersonic/dashboards/default.json index 00da1f2..b55b646 100644 --- a/helm/supersonic/dashboards/default.json +++ b/helm/supersonic/dashboards/default.json @@ -15,7 +15,7 @@ "description": "", "gridPos": { "h": 4, - "w": 10, + "w": 24, "x": 0, "y": 0 }, @@ -32,6 +32,77 @@ "transparent": true, "type": "text" }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", job=\"%RELEASE_NAME%\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Number of Triton Servers", + "type": "stat" + }, { "datasource": { "type": "prometheus", @@ -83,7 +154,7 @@ "gridPos": { "h": 10, "w": 10, - "x": 0, + "x": 4, "y": 4 }, "id": 2, @@ -112,6 +183,105 @@ ], "title": "Server Load Metric", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", job=\"%RELEASE_NAME%\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Number of Triton Servers", + "type": "timeseries" } ], "refresh": "5s", diff --git a/helm/supersonic/templates/_helpers.tpl b/helm/supersonic/templates/_helpers.tpl index a2c69e8..71400f5 100644 --- a/helm/supersonic/templates/_helpers.tpl +++ b/helm/supersonic/templates/_helpers.tpl @@ -27,12 +27,12 @@ {{- if not ( eq .Values.prometheus.serverLoadMetric "" ) }} {{- printf "%s" .Values.prometheus.serverLoadMetric -}} {{- else }} -sum by (instance) ( - nv_inference_queue_duration_us{instance=~"{{ include "supersonic.name" . }}"} +sum by (job) ( + nv_inference_queue_duration_us{job=~"{{ include "supersonic.tritonName" . }}"} ) / -sum by (instance) ( - (nv_inference_exec_count{instance=~"{{ include "supersonic.name" . }}"} * 1000) + 0.001 +sum by (job) ( + (nv_inference_exec_count{job=~"{{ include "supersonic.tritonName" . }}"} * 1000) + 0.001 ) {{- end }} {{- end }} \ No newline at end of file