Skip to content

Commit

Permalink
improve monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kondratyevd committed Feb 13, 2025
1 parent 0645653 commit 1e12681
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 17 deletions.
29 changes: 15 additions & 14 deletions helm/supersonic/dashboards/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
},
"disableTextWrap": false,
"editorMode": "code",
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\"})",
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\", namespace=~\"${namespace}\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
Expand Down Expand Up @@ -328,7 +328,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum by (release) ( rate(nv_inference_compute_infer_duration_us{release=~\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
"expr": "sum (rate(nv_inference_compute_infer_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))",
"instant": false,
"legendFormat": "Inference",
"range": true,
Expand All @@ -340,7 +340,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum by (release) ( rate(nv_inference_queue_duration_us{release=~\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
"expr": "sum (rate(nv_inference_queue_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))",
"hide": false,
"instant": false,
"legendFormat": "Queue",
Expand All @@ -353,7 +353,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum by (release) ( rate(nv_inference_compute_input_duration_us{release=~\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
"expr": "sum (rate(nv_inference_compute_input_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))",
"hide": false,
"instant": false,
"legendFormat": "Input",
Expand All @@ -366,7 +366,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum by (release) ( rate(nv_inference_compute_output_duration_us{release=~\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
"expr": "sum (rate(nv_inference_compute_output_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))",
"hide": false,
"instant": false,
"legendFormat": "Output",
Expand All @@ -379,7 +379,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": " sum(\n rate(envoy_http_downstream_rq_time_sum{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\"}[15s])\n /\n rate(envoy_http_downstream_rq_time_count{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\"}[15s])\n ) by (release)",
"expr": "sum(rate(envoy_http_downstream_rq_time_sum{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum(rate(envoy_http_downstream_rq_time_count{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]))",
"hide": false,
"instant": false,
"legendFormat": "Total (measured at proxy)",
Expand Down Expand Up @@ -477,7 +477,7 @@
},
"disableTextWrap": false,
"editorMode": "code",
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\"})",
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\", namespace=~\"${namespace}\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
Expand Down Expand Up @@ -600,7 +600,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "nv_gpu_utilization",
"expr": "nv_gpu_utilization{namespace=~\"${namespace}\"}",
"hide": false,
"instant": false,
"legendFormat": "{{pod}}",
Expand Down Expand Up @@ -708,7 +708,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "nv_gpu_power_usage / nv_gpu_power_limit",
"expr": "(nv_gpu_power_usage{namespace=~\"${namespace}\"} / nv_gpu_power_limit{namespace=~\"${namespace}\"})",
"hide": false,
"instant": false,
"legendFormat": "{{pod}}",
Expand Down Expand Up @@ -762,6 +762,7 @@
}
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "absolute",
Expand All @@ -784,7 +785,7 @@
}
]
},
"unit": "bytes"
"unit": "percentunit"
},
"overrides": []
},
Expand Down Expand Up @@ -815,7 +816,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "nv_gpu_memory_used_bytes",
"expr": "(nv_gpu_memory_used_bytes{namespace=~\"${namespace}\"} / nv_gpu_memory_total_bytes{namespace=~\"${namespace}\"})",
"hide": false,
"instant": false,
"legendFormat": "{{pod}}",
Expand Down Expand Up @@ -925,7 +926,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(release)",
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) by(release)",
"instant": false,
"interval": "",
"legendFormat": "{{ release }}",
Expand Down Expand Up @@ -1022,7 +1023,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(pod)",
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) by(pod)",
"instant": false,
"interval": "",
"legendFormat": "{{ pod }}",
Expand Down Expand Up @@ -1119,7 +1120,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(model)",
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) by(model)",
"instant": false,
"interval": "",
"legendFormat": "{{ model }}",
Expand Down
12 changes: 10 additions & 2 deletions helm/supersonic/dashboards/variables.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(release)",
"definition": "label_values({app=\"supersonic\",namespace=\"%NAMESPACE%\"},release)",
"hide": 0,
"includeAll": true,
"label": "Release",
Expand All @@ -23,7 +23,7 @@
"options": [],
"query": {
"qryType": 1,
"query": "label_values(release)",
"query": "label_values({app=\"supersonic\",namespace=\"%NAMESPACE%\"},release)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
Expand Down Expand Up @@ -63,6 +63,14 @@
"query": "%PROMETHEUS_URL_FULL%",
"skipUrlSync": false,
"type": "constant"
},
{
"hide": 2,
"label": "Namespace",
"name": "namespace",
"query": "%NAMESPACE%",
"skipUrlSync": false,
"type": "constant"
}
]
}
Expand Down
1 change: 1 addition & 0 deletions helm/supersonic/templates/default-dashboard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ data:
{{- /* Replace template variables with actual values */ -}}
{{- $dashboard := $dashboard | replace "%RELEASE_NAME%" .Release.Name }}
{{- $dashboard := $dashboard | replace "%CHART_VERSION%" .Chart.Version }}
{{- $dashboard := $dashboard | replace "%NAMESPACE%" .Release.Namespace }}
{{- $dashboard := $dashboard | replace "%SERVER_LOAD_METRIC%" $metric }}
{{- $dashboard := $dashboard | replace "%SERVER_LOAD_THRESHOLD%" $threshold }}
{{- $dashboard := $dashboard | replace "%PROMETHEUS_URL_FULL%" $prometheus_url }}
Expand Down
10 changes: 9 additions & 1 deletion helm/supersonic/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ spec:
targetLabel: release
regex: "(.*)"
replacement: "{{ include "supersonic.name" . }}"
# Attach a constant `app` label: the catch-all regex "(.*)" matches any
# __address__ value, so the label is always set to the rendered chart name.
# NOTE(review): the dashboard variable query filters on app="supersonic" -
# confirm .Chart.Name renders to that exact value.
- sourceLabels: [__address__]
targetLabel: app
regex: "(.*)"
replacement: "{{ .Chart.Name }}"

---

Expand All @@ -48,4 +52,8 @@ spec:
- sourceLabels: [__address__]
targetLabel: release
regex: "(.*)"
replacement: "{{ include "supersonic.name" . }}"
replacement: "{{ include "supersonic.name" . }}"
# Attach a constant `app` label: the catch-all regex "(.*)" matches any
# __address__ value, so the label is always set to the rendered chart name.
# NOTE(review): the dashboard variable query filters on app="supersonic" -
# confirm .Chart.Name renders to that exact value.
- sourceLabels: [__address__]
targetLabel: app
regex: "(.*)"
replacement: "{{ .Chart.Name }}"

0 comments on commit 1e12681

Please sign in to comment.