diff --git a/docker/grafana/json-models/index.json b/docker/grafana/json-models/index.json index f92adc2d..174a8ba2 100644 --- a/docker/grafana/json-models/index.json +++ b/docker/grafana/json-models/index.json @@ -164,7 +164,7 @@ "uid" : "prometheus" }, "editorMode" : "code", - "expr" : "timestamp(group by (jobid,user,batchflag,partition,nodes) (slurmjob_info{jobid=~\"^\\\\d+\"}))", + "expr" : "timestamp(group by (jobid,user,batchflag,partition,nodes) (rmsjob_info{jobid=~\"^\\\\d+\"}))", "format" : "table", "hide" : false, "instant" : false, diff --git a/docker/grafana/json-models/job.json b/docker/grafana/json-models/job.json index fdb065e5..79fe7070 100644 --- a/docker/grafana/json-models/job.json +++ b/docker/grafana/json-models/job.json @@ -187,7 +187,7 @@ }, "editorMode" : "code", "exemplar" : false, - "expr" : "(slurmjob_info{jobid=\"$jobid\"})", + "expr" : "(rmsjob_info{jobid=\"$jobid\"})", "instant" : false, "legendFormat" : "{{batchflag}}", "range" : true, @@ -270,7 +270,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "slurmjob_info{jobid=\"$jobid\"}", + "expr" : "rmsjob_info{jobid=\"$jobid\"}", "instant" : false, "legendFormat" : "__auto", "range" : true, @@ -362,7 +362,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card))", + "expr" : "avg(avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card))", "instant" : false, "legendFormat" : "GPU Core", "range" : true, @@ -374,7 +374,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(100 * avg(label_replace({__name__=~\"card(.*)_rocm_vram_used\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_used\") * on (instance) group_left() 
slurmjob_info{jobid=\"$jobid\"}) by (card) / avg(label_replace({__name__=~\"card(.*)_rocm_vram_total\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_total\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card))", + "expr" : "avg(100 * avg(label_replace({__name__=~\"card(.*)_rocm_vram_used\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_used\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card) / avg(label_replace({__name__=~\"card(.*)_rocm_vram_total\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_total\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card))", "hide" : false, "instant" : false, "legendFormat" : "GPU Memory", @@ -445,7 +445,7 @@ "disableTextWrap" : false, "editorMode" : "builder", "exemplar" : false, - "expr" : "slurmjob_info{jobid=\"$jobid\"}", + "expr" : "rmsjob_info{jobid=\"$jobid\"}", "fullMetaSearch" : false, "includeNullMetadata" : true, "instant" : false, @@ -534,7 +534,7 @@ }, "editorMode" : "code", "exemplar" : false, - "expr" : "slurmjob_info{jobid=\"$jobid\"}", + "expr" : "rmsjob_info{jobid=\"$jobid\"}", "instant" : false, "legendFormat" : "__auto", "range" : true, @@ -620,7 +620,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "max(timestamp(slurmjob_info{jobid=\"$jobid\"}))", + "expr" : "max(timestamp(rmsjob_info{jobid=\"$jobid\"}))", "hide" : false, "instant" : false, "interval" : "", @@ -711,7 +711,7 @@ }, "editorMode" : "code", "exemplar" : false, - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -810,7 
+810,7 @@ }, "editorMode" : "code", "exemplar" : false, - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "format" : "time_series", "instant" : false, "interval" : "", @@ -888,7 +888,7 @@ }, "editorMode" : "code", "exemplar" : false, - "expr" : "100 * max(label_replace({__name__=~\"card(.*)_rocm_vram_used\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_used\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card) / max(label_replace({__name__=~\"card(.*)_rocm_vram_total\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_total\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "100 * max(label_replace({__name__=~\"card(.*)_rocm_vram_used\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_used\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card) / max(label_replace({__name__=~\"card(.*)_rocm_vram_total\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_total\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -987,7 +987,7 @@ }, "editorMode" : "code", "exemplar" : false, - "expr" : "100 * avg(label_replace({__name__=~\"card(.*)_rocm_vram_used\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_used\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card) / max(label_replace({__name__=~\"card(.*)_rocm_vram_total\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_total\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "100 * 
avg(label_replace({__name__=~\"card(.*)_rocm_vram_used\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_used\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card) / max(label_replace({__name__=~\"card(.*)_rocm_vram_total\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_vram_total\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "format" : "time_series", "instant" : false, "interval" : "", @@ -1100,7 +1100,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -1198,7 +1198,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_temp_die_edge\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_temp_die_edge\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_temp_die_edge\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_temp_die_edge\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -1296,7 +1296,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_avg_pwr\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_avg_pwr\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_avg_pwr\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_avg_pwr\") * on (instance) group_left() 
rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -1394,7 +1394,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_sclk_clock_mhz\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_sclk_clock_mhz\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_sclk_clock_mhz\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_sclk_clock_mhz\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -1492,7 +1492,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_mclk_clock_mhz\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_mclk_clock_mhz\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (card)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_mclk_clock_mhz\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_mclk_clock_mhz\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (card)", "instant" : false, "legendFormat" : "Card: {{card}}", "range" : true, @@ -1711,7 +1711,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "quantile(0.20, (label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"})) ", + "expr" : "quantile(0.20, (label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})) ", "instant" : false, "legendFormat" : "Quantile 0.2 ", "range" : true, @@ -1723,7 +1723,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "quantile(0.5, (label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", 
\"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"})) ", + "expr" : "quantile(0.5, (label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})) ", "hide" : false, "instant" : false, "legendFormat" : "Median", @@ -1736,7 +1736,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "quantile(0.8, (label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"})) ", + "expr" : "quantile(0.8, (label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"})) ", "hide" : false, "instant" : false, "legendFormat" : "Quantile 0.8", @@ -1749,7 +1749,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg((label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}))", + "expr" : "avg((label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}))", "hide" : false, "instant" : false, "legendFormat" : "Average", @@ -1862,7 +1862,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}) by (instance)", + "expr" : "avg(label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}) by (instance)", "instant" : false, "legendFormat" : "__auto", 
"range" : true, @@ -2017,7 +2017,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr" : "label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() slurmjob_info{jobid=\"$jobid\"}", + "expr" : "label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\") * on (instance) group_left() rmsjob_info{jobid=\"$jobid\"}", "instant" : false, "legendFormat" : "__auto", "range" : true, @@ -2225,7 +2225,7 @@ "uid" : "$source" }, "editorMode" : "code", - "expr": "avg by (marker) ((label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\")) * on (instance) group_left(jobid,marker) slurmjob_info{jobid=\"$jobid\"} * on (jobid) group_left(marker) count by (jobid,marker) (slurmjob_annotations{jobid=\"$jobid\"} > 0))", + "expr": "avg by (marker) ((label_replace({__name__=~\"card(.*)_rocm_utilization\"}, \"card\", \"$1\", \"__name__\", \"card(.*)_rocm_utilization\")) * on (instance) group_left(jobid,marker) rmsjob_info{jobid=\"$jobid\"} * on (jobid) group_left(marker) count by (jobid,marker) (rmsjob_annotations{jobid=\"$jobid\"} > 0))", "hide" : false, "instant" : false, "legendFormat" : "__auto",