From a9b422c33651f93bc607d4430e40487c58899642 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac <robert.lucian.chiriac@gmail.com>
Date: Wed, 28 Jul 2021 22:25:33 +0300
Subject: [PATCH 1/5] Reduce labels from kube-state-metrics and kubelet
 exporters

---
 manager/manifests/prometheus-kube-state-metrics.yaml | 11 ++++++++++-
 manager/manifests/prometheus-kubelet-exporter.yaml   | 12 ++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml
index 89da6c4842..a42e78a7f2 100644
--- a/manager/manifests/prometheus-kube-state-metrics.yaml
+++ b/manager/manifests/prometheus-kube-state-metrics.yaml
@@ -266,7 +266,16 @@ spec:
           job_status_active\
           )"
       - action: labelkeep
-        regex: (__name__|exported_pod|exported_container|job_name|resource|deployment)
+        regex: (__name__|pod|job_name|resource|deployment)
+      - action: drop
+        regex: (node-exporter|aws-node|fluent-bit|kube-proxy)-(.+)
+        sourceLabels:
+        - pod
+      relabelings:
+      - sourceLabels:
+        - exported_pod
+        action: replace
+        targetLabel: pod
   namespaceSelector:
     any: true
   selector:
diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml
index 87855746a0..ad8f9fe05a 100644
--- a/manager/manifests/prometheus-kubelet-exporter.yaml
+++ b/manager/manifests/prometheus-kubelet-exporter.yaml
@@ -49,6 +49,18 @@ spec:
         )"
     - action: labelkeep
       regex: (__name__|pod|container|name)
+    - action: keep
+      regex: ()
+      sourceLabels:
+      - container
+    - action: keep
+      regex: ()
+      sourceLabels:
+      - name
+    - action: drop
+      regex: (node-exporter|aws-node|fluent-bit|kube-proxy)-(.+)
+      sourceLabels:
+      - pod
     path: /metrics/cadvisor
     port: https-metrics
     relabelings:

From fad52c13945d3ed7efd4cc99e338a88c495914ca Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac <robert.lucian.chiriac@gmail.com>
Date: Wed, 28 Jul 2021 22:29:00 +0300
Subject: [PATCH 2/5] Relabel exported_pod to pod

---
 manager/manifests/prometheus-dcgm-exporter.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml
index 96d82a5644..f3eb4232f7 100644
--- a/manager/manifests/prometheus-dcgm-exporter.yaml
+++ b/manager/manifests/prometheus-dcgm-exporter.yaml
@@ -126,7 +126,12 @@ spec:
           FB_FREE\
           )"
       - action: labelkeep
-        regex: (__name__|exported_pod)
+        regex: (__name__|pod)
+      relabelings:
+      - sourceLabels:
+        - exported_pod
+        action: replace
+        targetLabel: pod
   namespaceSelector:
     any: true
   selector:

From 90db24e52234a3aa403a55b09f97ae5bb322f924 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac <robert.lucian.chiriac@gmail.com>
Date: Wed, 28 Jul 2021 22:29:28 +0300
Subject: [PATCH 3/5] Update the Grafana dashboards

---
 .../grafana/grafana-dashboard-async.yaml      | 32 +++++++++----------
 .../grafana/grafana-dashboard-batch.yaml      | 32 +++++++++----------
 .../grafana/grafana-dashboard-cluster.yaml    |  2 +-
 .../grafana/grafana-dashboard-realtime.yaml   | 32 +++++++++----------
 .../grafana/grafana-dashboard-task.yaml       | 32 +++++++++----------
 5 files changed, 65 insertions(+), 65 deletions(-)

diff --git a/manager/manifests/grafana/grafana-dashboard-async.yaml b/manager/manifests/grafana/grafana-dashboard-async.yaml
index a6c45a1186..5bfe81070a 100644
--- a/manager/manifests/grafana/grafana-dashboard-async.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-async.yaml
@@ -1086,7 +1086,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1095,7 +1095,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total CPU Request",
@@ -1190,7 +1190,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1199,7 +1199,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Memory Request",
@@ -1294,14 +1294,14 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Usage",
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Capacity",
@@ -1395,7 +1395,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Used GPU Memory",
@@ -1403,7 +1403,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1515,7 +1515,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1524,7 +1524,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg CPU Request",
@@ -1621,7 +1621,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1630,7 +1630,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Memory Request",
@@ -1726,14 +1726,14 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg GPU Usage",
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg GPU Capacity",
@@ -1829,7 +1829,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Used GPU Memory",
@@ -1837,7 +1837,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml
index da99fecb12..413dc5325e 100644
--- a/manager/manifests/grafana/grafana-dashboard-batch.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml
@@ -522,7 +522,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -531,7 +531,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total CPU Request",
@@ -628,7 +628,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -637,7 +637,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Memory Request",
@@ -734,14 +734,14 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Usage",
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Capacity",
@@ -837,7 +837,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Used GPU Memory",
@@ -845,7 +845,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -963,7 +963,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -972,7 +972,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg CPU Request",
@@ -1071,7 +1071,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1080,7 +1080,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Memory Request",
@@ -1179,7 +1179,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -1187,7 +1187,7 @@ data:
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -1286,7 +1286,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Used GPU Memory",
@@ -1294,7 +1294,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
diff --git a/manager/manifests/grafana/grafana-dashboard-cluster.yaml b/manager/manifests/grafana/grafana-dashboard-cluster.yaml
index 0e819f7e16..c14d088ad1 100644
--- a/manager/manifests/grafana/grafana-dashboard-cluster.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-cluster.yaml
@@ -213,7 +213,7 @@ data:
           "targets": [
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_info{exported_pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
+              "expr": "sum(kube_pod_info{pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
index 97e91cc318..37b942add1 100644
--- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -1193,7 +1193,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1202,7 +1202,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total CPU Request",
@@ -1299,7 +1299,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1308,7 +1308,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Memory Request",
@@ -1406,7 +1406,7 @@ data:
           "targets": [
             {
               "exemplar": true,
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Usage",
@@ -1414,7 +1414,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Capacity",
@@ -1511,7 +1511,7 @@ data:
           "targets": [
             {
               "exemplar": true,
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Used GPU Memory",
@@ -1519,7 +1519,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1633,7 +1633,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1642,7 +1642,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg CPU Request",
@@ -1741,7 +1741,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1750,7 +1750,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Memory Request",
@@ -1848,14 +1848,14 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg GPU Usage",
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg GPU Capacity",
@@ -1953,7 +1953,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Used GPU Memory",
@@ -1961,7 +1961,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
diff --git a/manager/manifests/grafana/grafana-dashboard-task.yaml b/manager/manifests/grafana/grafana-dashboard-task.yaml
index 1305d94407..070aaa1813 100644
--- a/manager/manifests/grafana/grafana-dashboard-task.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-task.yaml
@@ -512,7 +512,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -521,7 +521,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total CPU Request",
@@ -618,7 +618,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -627,7 +627,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Memory Request",
@@ -724,14 +724,14 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Usage",
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total GPU Capacity",
@@ -827,7 +827,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Total Used GPU Memory",
@@ -835,7 +835,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -949,7 +949,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -958,7 +958,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg CPU Request",
@@ -1057,7 +1057,7 @@ data:
           "targets": [
             {
               "exemplar": false,
-              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -1066,7 +1066,7 @@ data:
             },
             {
               "exemplar": true,
-              "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Memory Request",
@@ -1165,7 +1165,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -1173,7 +1173,7 @@ data:
               "refId": "GPU Usage"
             },
             {
-              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -1272,7 +1272,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "hide": false,
               "interval": "",
               "legendFormat": "Avg Used GPU Memory",
@@ -1280,7 +1280,7 @@ data:
             },
             {
               "exemplar": false,
-              "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
               "format": "time_series",
               "instant": false,
               "interval": "",

From 79068ac074304884a961a77b025381bdc0cf12cb Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac <robert.lucian.chiriac@gmail.com>
Date: Wed, 28 Jul 2021 22:36:28 +0300
Subject: [PATCH 4/5] Update prometheus.md

---
 dev/prometheus.md | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index a9b90b03c1..10f81130e2 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -51,21 +51,16 @@ The following is a list of metrics that are currently in use.
 #### Kubelet metrics
 1. container_cpu_usage_seconds_total with the following labels:
     1. pod
-    1. container
-    1. name
 1. container_memory_working_set_bytes with the following labels:
     1. pod
-    1. name
-    1. container
 
 #### Kube-state-metrics metrics
 
 1. kube_pod_container_resource_requests with the following labels:
-    1. exported_pod
+    1. pod
     1. resource
-    1. exported_container (required for not dropping the values for each container of each pod)
 1. kube_pod_info with the following labels:
-    1. exported_pod
+    1. pod
 1. kube_deployment_status_replicas_available with the following labels:
     1. deployment
 1. kube_job_status_active with the following labels:
@@ -74,11 +69,11 @@ The following is a list of metrics that are currently in use.
 #### DCGM metrics
 
 1. DCGM_FI_DEV_GPU_UTIL with the following labels:
-    1. exported_pod
+    1. pod
 1. DCGM_FI_DEV_FB_USED with the following labels:
-    1. exported_pod
+    1. pod
 1. DCGM_FI_DEV_FB_FREE with the following labels:
-    1. exported_pod
+    1. pod
 
 #### Node metrics
 

From b00b54f5dc861d41ed9699fd2858d674cf3dbea2 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac <robert.lucian.chiriac@gmail.com>
Date: Wed, 28 Jul 2021 22:39:51 +0300
Subject: [PATCH 5/5] Make lint

---
 manager/manifests/prometheus-dcgm-exporter.yaml      | 2 +-
 manager/manifests/prometheus-kube-state-metrics.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml
index f3eb4232f7..d5f2c6a774 100644
--- a/manager/manifests/prometheus-dcgm-exporter.yaml
+++ b/manager/manifests/prometheus-dcgm-exporter.yaml
@@ -128,7 +128,7 @@ spec:
       - action: labelkeep
         regex: (__name__|pod)
       relabelings:
-      - sourceLabels: 
+      - sourceLabels:
         - exported_pod
         action: replace
         targetLabel: pod
diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml
index a42e78a7f2..8b1b1412ff 100644
--- a/manager/manifests/prometheus-kube-state-metrics.yaml
+++ b/manager/manifests/prometheus-kube-state-metrics.yaml
@@ -272,7 +272,7 @@ spec:
         sourceLabels:
         - pod
       relabelings:
-      - sourceLabels: 
+      - sourceLabels:
         - exported_pod
         action: replace
         targetLabel: pod