diff --git a/build/package/ceems_exporter/ceems_exporter.service b/build/package/ceems_exporter/ceems_exporter.service index 8840dfa..3993c77 100644 --- a/build/package/ceems_exporter/ceems_exporter.service +++ b/build/package/ceems_exporter/ceems_exporter.service @@ -18,8 +18,8 @@ StartLimitInterval=0 ProtectHome=read-only -AmbientCapabilities=CAP_SYS_PTRACE CAP_DAC_READ_SEARCH CAP_SETUID CAP_SETGID CAP_BPF CAP_PERFMON CAP_SYS_RESOURCE -CapabilityBoundingSet=CAP_SYS_PTRACE CAP_DAC_READ_SEARCH CAP_SETUID CAP_SETGID CAP_BPF CAP_PERFMON CAP_SYS_RESOURCE +AmbientCapabilities=CAP_SYS_PTRACE CAP_DAC_READ_SEARCH CAP_SETUID CAP_SETGID CAP_DAC_OVERRIDE CAP_BPF CAP_PERFMON CAP_SYS_RESOURCE +CapabilityBoundingSet=CAP_SYS_PTRACE CAP_DAC_READ_SEARCH CAP_SETUID CAP_SETGID CAP_DAC_OVERRIDE CAP_BPF CAP_PERFMON CAP_SYS_RESOURCE ProtectSystem=strict ProtectControlGroups=true diff --git a/etc/nvidia-dcgm-exporter/counters.csv b/etc/nvidia-dcgm-exporter/counters.csv index 1652843..b708b4f 100644 --- a/etc/nvidia-dcgm-exporter/counters.csv +++ b/etc/nvidia-dcgm-exporter/counters.csv @@ -38,6 +38,7 @@ DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encounte # Memory usage DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). +DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB). # ECC # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. diff --git a/etc/prometheus/rules/gpu.rules b/etc/prometheus/rules/gpu.rules index 59133c5..d96bdf0 100644 --- a/etc/prometheus/rules/gpu.rules +++ b/etc/prometheus/rules/gpu.rules @@ -38,8 +38,21 @@ groups: expr: avg by (job)(avg_over_time(DCGM_FI_DEV_GPU_UTIL{job="sample-dcgm"}[1d:15m])) # Average GPU memory usage during last 1h - - record: instance:DCGM_FI_DEV_MEM_COPY_UTIL:avg1h - expr: avg by (job)(avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{job="sample-dcgm"}[1d:15m])) + - record: instance:DCGM_FI_DEV_MEM_UTIL:avg1h + expr: |2 + avg by (job) ( + avg_over_time( + ( + (sum by (Hostname) (DCGM_FI_DEV_FB_USED{job="sample-dcgm"})) + / + ( + sum by (Hostname) (DCGM_FI_DEV_FB_USED{job="sample-dcgm"}) + + + sum by (Hostname) (DCGM_FI_DEV_FB_FREE{job="sample-dcgm"}) + ) + )[1h:15m] + ) + ) # Total energy usage during last 1h in kWh # PUE of 1 is used by default. Use appropriate PUE by replacing 1 by PUE ratio in expr