diff --git a/Makefile b/Makefile
index 0bb0dd3c..8d5a3296 100644
--- a/Makefile
+++ b/Makefile
@@ -201,7 +201,7 @@ skip-checkmetrics: $(PROMTOOL)
 .PHONY: checkrules
 checkrules: $(PROMTOOL)
 	@echo ">> checking rules for correctness"
-	find . -name "*rules*.yml" | xargs -I {} $(PROMTOOL) check rules {}
+	find . -name "*.rules" | xargs -I {} $(PROMTOOL) check rules {}
 
 .PHONY: skip-checkrules
 skip-checkrules: $(PROMTOOL)
diff --git a/README.md b/README.md
index 128c5bec..a69da7a3 100644
--- a/README.md
+++ b/README.md
@@ -176,7 +176,7 @@ SLURM job properties can be used here as well
 Currently the exporter supports few different ways to get these job properties.
 
 - Use prolog and epilog scripts to get the GPU to job ID map. Example prolog script
-is provided in the [repo](./configs/slurm/prolog.d/gpujobmap.sh). Similarly, this approach
+is provided in the [repo](./etc/slurm/prolog.d/gpujobmap.sh). Similarly, this approach
 needs `--collector.slurm.gpu.job.map.path=/run/gpujobmap` command line option.
 
 - Reading env vars from `/proc`: If the file created by prolog script cannot be found,
diff --git a/build/package/ceems_api_server/tsdb-config.yml b/build/package/ceems_api_server/tsdb-config.yml
index 0b79cd8e..8b85d4cd 100644
--- a/build/package/ceems_api_server/tsdb-config.yml
+++ b/build/package/ceems_api_server/tsdb-config.yml
@@ -28,9 +28,9 @@ queries: {}
 #   avg_over_time(
 #     avg by (uuid) (
 #       (
-#         rate(ceems_compute_unit_cpu_user_seconds{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+#         rate(ceems_compute_unit_cpu_user_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
 #         +
-#         rate(ceems_compute_unit_cpu_system_seconds{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+#         rate(ceems_compute_unit_cpu_system_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
 #       )
 #       /
 #       ceems_compute_unit_cpus{uuid=~"{{.UUIDs}}"}
diff --git a/configs/nvidia-dcgm-exporter/counters.csv b/etc/nvidia-dcgm-exporter/counters.csv
similarity index 100%
rename from configs/nvidia-dcgm-exporter/counters.csv
rename to etc/nvidia-dcgm-exporter/counters.csv
diff --git a/etc/prometheus/rules/cpu-gpu-nodes.rules b/etc/prometheus/rules/cpu-gpu-nodes.rules
new file mode 100644
index 00000000..0485c029
--- /dev/null
+++ b/etc/prometheus/rules/cpu-gpu-nodes.rules
@@ -0,0 +1,265 @@
+---
+# Recording rules for nodes with CPU and GPU
+#
+# This example is applicable to NVIDIA GPUs and the metrics reported by the NVIDIA DCGM
+# exporter. The core idea should be applicable to AMD GPUs as well, using the power
+# usage reported by the AMD SMI exporter.
+#
+# We can leverage recording rules to estimate the power consumption of each compute
+# unit. We get node-level power consumption from IPMI DCMI readings and we need to
+# split this global power consumption across compute units. The idea is to use RAPL
+# readings, which report the power consumed by the CPU and DRAM, to split the IPMI DCMI
+# power reading into CPU and DRAM power consumption. There is no easy way to split the
+# power of the remaining components like network, storage and fans (if they exist).
+#
+# We can also introduce the Power Usage Effectiveness (PUE) ratio in the power usage
+# estimation of compute units in these recording rules.
+#
+# Energy consumption per compute unit is estimated with the following assumptions:
+#
+# First, we assume that 90% of the power is consumed by the CPU and DRAM and 10% by the
+# network. We ignore storage, as it is served by external storage clusters and the
+# nodes seldom use local disks.
+#
+# We leverage the RAPL package and DRAM readings to split that 90% of the power between
+# the CPU and DRAM components.
+#
+# At node level, the power consumed by the CPU and DRAM can be estimated as
+#
+# node_cpu_power = 0.9 * ipmi_power * (rapl_package/(rapl_package + rapl_dram))
+# node_dram_power = 0.9 * ipmi_power * (rapl_dram/(rapl_package + rapl_dram))
+#
+# Now we have node-level power usage for CPU and DRAM. We split it further at the
+# compute unit level using the CPU time and DRAM usage of each compute unit. For
+# network power usage, we split the total network power equally among all compute units
+# running on the node at a given time.
+#
+# compute_unit_cpu_power = node_cpu_power * (total_compute_unit_cpu_sec / total_node_cpu_sec)
+# compute_unit_dram_power = node_dram_power * (total_compute_unit_mem_usage / total_node_mem_usage)
+# compute_unit_net_power = 0.1 * ipmi_power / num_compute_units
+#
+# Total power usage of compute unit = compute_unit_cpu_power + compute_unit_dram_power + compute_unit_net_power
+#
+# Finally, we can introduce PUE into the energy consumption by multiplying the total
+# compute unit power usage by the PUE ratio.
+#
+# Total power usage of compute unit = PUE * (compute_unit_cpu_power + compute_unit_dram_power + compute_unit_net_power)
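+#
+# As a worked example with purely hypothetical readings and PUE = 1: for
+# ipmi_power = 500 W, rapl_package = 320 W, rapl_dram = 80 W and a compute unit using
+# 25% of the node's CPU time and 10% of its used memory, alongside 4 other units:
+#
+# node_cpu_power  = 0.9 * 500 * (320 / (320 + 80)) = 360 W
+# node_dram_power = 0.9 * 500 * (80 / (320 + 80))  = 90 W
+# compute_unit_power = 360 * 0.25 + 90 * 0.10 + 0.1 * 500 / 5 = 109 W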
+#
+# IMPORTANT: A rate interval of 2m is used in rate estimations. This is based on a
+# scrape interval of 30s. It is always desirable for the rate interval to be at least
+# 4 times the scrape interval. So, if the scrape interval changes, we need to change
+# the rate interval in these rules.
+#
+groups:
+  - name: sample-gpu
+    rules:
+      # In some cases, the power reported by IPMI DCMI includes both CPU and GPU power.
+      # We need to subtract the power reported by DCGM from the IPMI DCMI reading to
+      # get the CPU-only power consumption.
+      #
+      # We take the average power over 5m reported by DCGM to smooth out any
+      # "unnatural" peaks.
+      #
+      # IMPORTANT: DCGM reports power usage based on modelling, so there can be times
+      # where the power reported for the ensemble of GPUs by DCGM exceeds the power
+      # reported by IPMI. In this case the query tends to give negative values, which
+      # is not physically possible. To avoid such situations, we add a >= 0 condition
+      # to the query so that negative values are filtered from the query result. They
+      # will appear as gaps in the TSDB. We can fix this on the Grafana side by
+      # choosing to connect the gaps in the time series in the panel options.
+      # Not ideal, but we don't have many options here!!
+      #
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_ipmi_dcmi_avg_watts:pue_avg
+        expr: 1 * (ceems_ipmi_dcmi_avg_watts{job="sample-gpu"} - on(instance) group_left() sum by (instance) (avg_over_time(DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"}[5m:])) >= 0)
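+
+      # For instance, with a (hypothetical) data centre PUE of 1.4, the rule above
+      # would read:
+      #
+      # - record: instance:ceems_ipmi_dcmi_avg_watts:pue_avg
+      #   expr: 1.4 * (ceems_ipmi_dcmi_avg_watts{job="sample-gpu"} - on(instance) group_left() sum by (instance) (avg_over_time(DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"}[5m:])) >= 0)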
+
+      # Unit's total CPU energy usage estimated from IPMI DCMI.
+      # The A100 partition does not report RAPL CPU and DRAM power usage. So for this
+      # partition we make an assumption of 70% CPU, 20% DRAM and 10% network.
+      - record: unit:ceems_compute_unit_cpu_energy_usage:sum
+        expr: |2
+          0.7 * instance:ceems_ipmi_dcmi_avg_watts:pue_avg{job="sample-gpu"} # Assumption: 70% CPU, 20% DRAM and 10% network.
+          * on (instance) group_right () # CPU Power usage * (Job CPU Time / Total CPU Time) -> CPU power usage by "Job"
+          (
+            (
+              rate(ceems_compute_unit_cpu_user_seconds_total{job="sample-gpu"}[2m])
+              +
+              rate(ceems_compute_unit_cpu_system_seconds_total{job="sample-gpu"}[2m])
+            )
+            / on (instance) group_left ()
+            sum by (instance) (rate(ceems_cpu_seconds_total{job="sample-gpu",mode!~"idle|iowait|steal"}[2m]))
+          )
+          +
+          0.2 * instance:ceems_ipmi_dcmi_avg_watts:pue_avg{job="sample-gpu"}
+          * on (instance) group_right () # DRAM Power usage * (Job Memory used / Total Memory used) -> DRAM power usage by "Job"
+          (
+            ceems_compute_unit_memory_used_bytes{job="sample-gpu"}
+            / on (instance) group_left ()
+            (
+              ceems_meminfo_MemTotal_bytes{job="sample-gpu"}
+              - on (instance)
+              ceems_meminfo_MemAvailable_bytes{job="sample-gpu"}
+            )
+          )
+          +
+          0.1 * instance:ceems_ipmi_dcmi_avg_watts:pue_avg{job="sample-gpu"}
+          * on (instance) group_right () # Network Power usage / Number of Jobs -> Network power usage by "Job"
+          (
+            ceems_compute_unit_memory_used_bytes{job="sample-gpu"}
+            /
+            (
+              ceems_compute_unit_memory_used_bytes{job="sample-gpu"}
+              * on (instance) group_left ()
+              ceems_compute_units{job="sample-gpu"}
+            )
+          )
+
+  # Rule group for aggregate metrics. Here we estimate the CPU usage, CPU memory, CPU
+  # energy and CPU emissions during the last 24h at intervals of 15m, which gives us a
+  # nice rolling aggregate of the usage and consumption time series.
+  #
+  # NOTE that there are two different intervals in these queries:
+  # - Evaluation interval: how regularly we evaluate this rule. We don't need to
+  #   evaluate these series every, let's say, 30s. We want a global view of these
+  #   aggregate metrics, and evaluating every 15m or 30m should be OK.
+  #
+  # - Query interval: how many data points we use to evaluate the rules. By default we
+  #   use a scrape interval of 30s and hence we have data for every 30s. However, we do
+  #   not need such a fine interval to estimate daily metrics. Using it would also
+  #   increase the query time, as the TSDB needs to load a lot of data into memory to
+  #   evaluate the query. So we use 15m as the query interval, which is reasonable for
+  #   estimating aggregate metrics over 24h.
+  - name: sample-gpu-agg
+    interval: 15m
+    rules:
+      # Average CPU usage during last 24h
+      - record: instance:ceems_cpu_usage:avg24h
+        expr: |2
+          avg by (job)(
+            avg_over_time(
+              (
+                sum by (instance) (rate(ceems_cpu_seconds_total{job="sample-gpu",mode!~"idle|iowait|steal"}[2m]))
+                *
+                100
+                / on (instance) group_right()
+                ceems_cpu_count{job="sample-gpu"}
+              )[1d:15m]
+            )
+          )
+
+      # Average CPU memory usage during last 24h
+      - record: instance:ceems_cpu_mem_usage:avg24h
+        expr: |2
+          avg by (job)(
+            avg_over_time(
+              (
+                (
+                  1
+                  -
+                  (
+                    ceems_meminfo_MemAvailable_bytes{job="sample-gpu"} / ceems_meminfo_MemTotal_bytes{job="sample-gpu"}
+                  )
+                )
+              )[1d:15m]
+            )
+          )
+          *
+          100
+
+      # Total energy usage during last 24h in kWh.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      # We sum all the IPMI power readings during the last 24h at an interval of 15m.
+      # So, to estimate the total energy, we compute sum(P * deltat) / 3.6e9, where P
+      # is the power usage and deltat is the interval in milliseconds.
+      - record: instance:ceems_cpu_energy_usage:pue_sum24h
+        expr: |2
+          sum by (job)(
+            sum_over_time(
+              (
+                1
+                *
+                (
+                  ceems_ipmi_dcmi_avg_watts{job="sample-gpu"}
+                  - on (instance) group_left ()
+                  sum by (instance) (avg_over_time(DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"}[5m:]))
+                ) >= 0
+              )[1d:15m]
+            )
+          )
+          *
+          15*60000
+          /
+          3.6e+09
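+
+      # Sanity check with hypothetical numbers: a node drawing a constant 500 W for
+      # 24h yields 96 samples at 15m intervals, so the rule gives
+      # 96 * 500 * 15 * 60000 / 3.6e9 = 12 kWh, which matches 500 W * 24 h = 12 kWh.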
+
+      # Total emissions from CPU during last 24h in grams, using the emission factor
+      # from RTE.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_cpu_emissions_rte:pue_sum24h
+        expr: |2
+          sum by (job) (
+            sum_over_time(
+              (
+                label_replace(
+                  1
+                  *
+                  (
+                    (
+                      ceems_ipmi_dcmi_avg_watts{job="sample-gpu"}
+                      - on (instance) group_left ()
+                      sum by (instance) (avg_over_time(DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"}[5m:]))
+                    )
+                    >=
+                    0
+                  )
+                  *
+                  15*60000
+                  /
+                  3.6e+09,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(ceems_emissions_gCo2_kWh{provider="rte"}, "common_label", "mock", "hostname", "(.*)")
+              )[1d:15m]
+            )
+          )
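+
+      # Note on the label_replace() calls: the energy series and the emission factor
+      # series share no labels, so we attach the same static label common_label="mock"
+      # to both sides and join on it. This is a common PromQL idiom for multiplying two
+      # otherwise unrelated series.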
+
+      # Total emissions from CPU during last 24h in grams, using the emission factor
+      # from Electricity Maps.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_cpu_emissions_emaps:pue_sum24h
+        expr: |2
+          sum by (job) (
+            sum_over_time(
+              (
+                label_replace(
+                  1
+                  *
+                  (
+                    (
+                      ceems_ipmi_dcmi_avg_watts{job="sample-gpu"}
+                      - on (instance) group_left ()
+                      sum by (instance) (avg_over_time(DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"}[5m:]))
+                    )
+                    >=
+                    0
+                  )
+                  *
+                  15*60000
+                  /
+                  3.6e+09,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="emaps"},
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+              )[1d:15m]
+            )
+          )
diff --git a/etc/prometheus/rules/cpu-only-nodes.rules b/etc/prometheus/rules/cpu-only-nodes.rules
new file mode 100644
index 00000000..9e090a99
--- /dev/null
+++ b/etc/prometheus/rules/cpu-only-nodes.rules
@@ -0,0 +1,212 @@
+---
+# Recording rules for CPU-only nodes
+#
+# We can leverage recording rules to estimate the power consumption of each compute
+# unit. We get node-level power consumption from IPMI DCMI readings and we need to
+# split this global power consumption across compute units. The idea is to use RAPL
+# readings, which report the power consumed by the CPU and DRAM, to split the IPMI DCMI
+# power reading into CPU and DRAM power consumption. There is no easy way to split the
+# power of the remaining components like network, storage and fans (if they exist).
+#
+# We can also introduce the Power Usage Effectiveness (PUE) ratio in the power usage
+# estimation of compute units in these recording rules.
+#
+# Energy consumption per compute unit is estimated with the following assumptions:
+#
+# First, we assume that 90% of the power is consumed by the CPU and DRAM and 10% by the
+# network. We ignore storage, as it is served by external storage clusters and the
+# nodes seldom use local disks.
+#
+# We leverage the RAPL package and DRAM readings to split that 90% of the power between
+# the CPU and DRAM components.
+#
+# At node level, the power consumed by the CPU and DRAM can be estimated as
+#
+# node_cpu_power = 0.9 * ipmi_power * (rapl_package/(rapl_package + rapl_dram))
+# node_dram_power = 0.9 * ipmi_power * (rapl_dram/(rapl_package + rapl_dram))
+#
+# Now we have node-level power usage for CPU and DRAM. We split it further at the
+# compute unit level using the CPU time and DRAM usage of each compute unit. For
+# network power usage, we split the total network power equally among all compute units
+# running on the node at a given time.
+#
+# compute_unit_cpu_power = node_cpu_power * (total_compute_unit_cpu_sec / total_node_cpu_sec)
+# compute_unit_dram_power = node_dram_power * (total_compute_unit_mem_usage / total_node_mem_usage)
+# compute_unit_net_power = 0.1 * ipmi_power / num_compute_units
+#
+# Total power usage of compute unit = compute_unit_cpu_power + compute_unit_dram_power + compute_unit_net_power
+#
+# Finally, we can introduce PUE into the energy consumption by multiplying the total
+# compute unit power usage by the PUE ratio.
+#
+# Total power usage of compute unit = PUE * (compute_unit_cpu_power + compute_unit_dram_power + compute_unit_net_power)
+#
+# IMPORTANT: A rate interval of 2m is used in rate estimations. This is based on a
+# scrape interval of 30s. It is always desirable for the rate interval to be at least
+# 4 times the scrape interval. So, if the scrape interval changes, we need to change
+# the rate interval in these rules.
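+#
+# Before enabling these rules, the RAPL split fraction can be sanity-checked with a
+# one-off query in the Prometheus console (a hypothetical example; it should return
+# values between 0 and 1 for every instance):
+#
+#   sum by (instance) (rate(ceems_rapl_package_joules_total{job="sample-cpu"}[2m]))
+#   /
+#   (
+#     sum by (instance) (rate(ceems_rapl_package_joules_total{job="sample-cpu"}[2m]))
+#     +
+#     sum by (instance) (rate(ceems_rapl_dram_joules_total{job="sample-cpu"}[2m]))
+#   )
+#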
+groups:
+  - name: sample-cpu
+    rules:
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_ipmi_dcmi_avg_watts:pue_avg
+        expr: 1 * ceems_ipmi_dcmi_avg_watts{job="sample-cpu"}
+
+      # Unit's total CPU energy usage estimated from IPMI DCMI.
+      - record: unit:ceems_compute_unit_cpu_energy_usage:sum
+        expr: |2
+          0.9 * instance:ceems_ipmi_dcmi_avg_watts:pue_avg{job="sample-cpu"} # Assumption: 90% power usage by CPU and DRAM and 10% by network.
+          * on (instance) group_left () # 0.9 * IPMI power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total power usage by "CPU"
+          (
+            sum by (instance) (rate(ceems_rapl_package_joules_total{job="sample-cpu"}[2m]))
+            /
+            (
+              sum by (instance) (rate(ceems_rapl_package_joules_total{job="sample-cpu"}[2m]))
+              +
+              sum by (instance) (rate(ceems_rapl_dram_joules_total{job="sample-cpu"}[2m]))
+            )
+          )
+          * on (instance) group_right () # CPU Power usage * (Job CPU Time / Total CPU Time) -> CPU power usage by "Job"
+          (
+            (
+              rate(ceems_compute_unit_cpu_user_seconds_total{job="sample-cpu"}[2m])
+              +
+              rate(ceems_compute_unit_cpu_system_seconds_total{job="sample-cpu"}[2m])
+            )
+            / on (instance) group_left ()
+            sum by (instance) (rate(ceems_cpu_seconds_total{job="sample-cpu",mode!~"idle|iowait|steal"}[2m]))
+          )
+          +
+          0.9 * instance:ceems_ipmi_dcmi_avg_watts:pue_avg{job="sample-cpu"}
+          * on (instance) group_left () # 0.9 * IPMI power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total power usage by "DRAM"
+          (
+            sum by (instance) (rate(ceems_rapl_dram_joules_total{job="sample-cpu"}[2m]))
+            /
+            (
+              sum by (instance) (rate(ceems_rapl_package_joules_total{job="sample-cpu"}[2m]))
+              +
+              sum by (instance) (rate(ceems_rapl_dram_joules_total{job="sample-cpu"}[2m]))
+            )
+          )
+          * on (instance) group_right () # DRAM Power usage * (Job Memory used / Total Memory used) -> DRAM power usage by "Job"
+          (
+            ceems_compute_unit_memory_used_bytes{job="sample-cpu"}
+            / on (instance) group_left ()
+            (
+              ceems_meminfo_MemTotal_bytes{job="sample-cpu"}
+              - on (instance)
+              ceems_meminfo_MemAvailable_bytes{job="sample-cpu"}
+            )
+          )
+          +
+          0.1 * instance:ceems_ipmi_dcmi_avg_watts:pue_avg{job="sample-cpu"}
+          * on (instance) group_right () # Network Power usage / Number of Jobs -> Network power usage by "Job"
+          (
+            ceems_compute_unit_memory_used_bytes{job="sample-cpu"}
+            /
+            (
+              ceems_compute_unit_memory_used_bytes{job="sample-cpu"}
+              * on (instance) group_left ()
+              ceems_compute_units{job="sample-cpu"}
+            )
+          )
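+
+      # Note on the last term: mem_used / (mem_used * num_units) algebraically reduces
+      # to 1 / num_units, i.e. an equal share of the network power per compute unit.
+      # The roundabout form is used so that the result keeps the per-unit labels of
+      # ceems_compute_unit_memory_used_bytes.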
+
+  # Rule group for aggregate metrics. Here we estimate the CPU usage, CPU memory, CPU
+  # energy and CPU emissions during the last 24h at intervals of 15m, which gives us a
+  # nice rolling aggregate of the usage and consumption time series.
+  #
+  # NOTE that there are two different intervals in these queries:
+  # - Evaluation interval: how regularly we evaluate this rule. We don't need to
+  #   evaluate these series every, let's say, 30s. We want a global view of these
+  #   aggregate metrics, and evaluating every 15m or 30m should be OK.
+  #
+  # - Query interval: how many data points we use to evaluate the rules. By default we
+  #   use a scrape interval of 30s and hence we have data for every 30s. However, we do
+  #   not need such a fine interval to estimate daily metrics. Using it would also
+  #   increase the query time, as the TSDB needs to load a lot of data into memory to
+  #   evaluate the query. So we use 15m as the query interval, which is reasonable for
+  #   estimating aggregate metrics over 24h.
+  - name: sample-cpu-agg
+    interval: 15m
+    rules:
+      # Average CPU usage during last 24h
+      - record: instance:ceems_cpu_usage:avg24h
+        expr: |2
+          avg by (job)(
+            avg_over_time(
+              (
+                sum by (instance) (rate(ceems_cpu_seconds_total{job="sample-cpu",mode!~"idle|iowait|steal"}[2m]))
+                *
+                100
+                / on (instance) group_right()
+                ceems_cpu_count{job="sample-cpu"}
+              )[1d:15m]
+            )
+          )
+
+      # Average CPU memory usage during last 24h
+      - record: instance:ceems_cpu_mem_usage:avg24h
+        expr: |2
+          avg by (job)(
+            avg_over_time(
+              (
+                (
+                  1
+                  -
+                  (
+                    ceems_meminfo_MemAvailable_bytes{job="sample-cpu"} / ceems_meminfo_MemTotal_bytes{job="sample-cpu"}
+                  )
+                )
+              )[1d:15m]
+            )
+          )
+          *
+          100
+
+      # Total energy usage during last 24h in kWh.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      # We sum all the IPMI power readings during the last 24h at an interval of 15m.
+      # So, to estimate the total energy, we compute sum(P * deltat) / 3.6e9, where P
+      # is the power usage and deltat is the interval in milliseconds.
+      - record: instance:ceems_cpu_energy_usage:pue_sum24h
+        expr: 1 * sum by (job)(sum_over_time((ceems_ipmi_dcmi_avg_watts{job="sample-cpu"})[1d:15m])) * 15 * 60000 / 3.6e9
+
+      # Total emissions from CPU during last 24h in grams, using the emission factor
+      # from RTE.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_cpu_emissions_rte:pue_sum24h
+        expr: |2
+          sum by (job)(
+            sum_over_time(
+              (
+                label_replace(
+                  1 * ceems_ipmi_dcmi_avg_watts{job="sample-cpu"} * 15 * 60000 / 3.6e+09,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(ceems_emissions_gCo2_kWh{provider="rte"}, "common_label", "mock", "hostname", "(.*)")
+              )[1d:15m]
+            )
+          )
+
+      # Total emissions from CPU during last 24h in grams, using the emission factor
+      # from Electricity Maps.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_cpu_emissions_emaps:pue_sum24h
+        expr: |2
+          sum by (job)(
+            sum_over_time(
+              (
+                label_replace(
+                  1 * ceems_ipmi_dcmi_avg_watts{job="sample-cpu"} * 15 * 60000 / 3.6e+09,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(ceems_emissions_gCo2_kWh{provider="emaps"}, "common_label", "mock", "hostname", "(.*)")
+              )[1d:15m]
+            )
+          )
diff --git a/etc/prometheus/rules/gpu.rules b/etc/prometheus/rules/gpu.rules
new file mode 100644
index 00000000..a6391db2
--- /dev/null
+++ b/etc/prometheus/rules/gpu.rules
@@ -0,0 +1,90 @@
+---
+# Recording rules for GPU power usage
+#
+# This example is applicable to NVIDIA GPUs and the metrics reported by the NVIDIA DCGM
+# exporter. The core idea should be applicable to AMD GPUs as well, using the power
+# usage reported by the AMD SMI exporter.
+#
+# We leverage these rules to include PUE (Power Usage Effectiveness) in the power
+# estimation as well.
+#
+groups:
+  - name: sample-dcgm
+    rules:
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:DCGM_FI_DEV_POWER_USAGE:pue_avg
+        expr: 1 * DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"}
+
+  # Rule group for aggregate metrics. Here we estimate the GPU usage, GPU memory usage,
+  # GPU energy and GPU emissions during the last 24h at intervals of 15m, which gives
+  # us a nice rolling aggregate of the usage and consumption time series.
+  #
+  # NOTE that there are two different intervals in these queries:
+  # - Evaluation interval: how regularly we evaluate this rule. We don't need to
+  #   evaluate these series every, let's say, 30s. We want a global view of these
+  #   aggregate metrics, and evaluating every 15m or 30m should be OK.
+  #
+  # - Query interval: how many data points we use to evaluate the rules. By default we
+  #   use a scrape interval of 30s and hence we have data for every 30s. However, we do
+  #   not need such a fine interval to estimate daily metrics. Using it would also
+  #   increase the query time, as the TSDB needs to load a lot of data into memory to
+  #   evaluate the query. So we use 15m as the query interval, which is reasonable for
+  #   estimating aggregate metrics over 24h.
+  - name: sample-dcgm-agg
+    interval: 15m
+    rules:
+      # Average GPU usage during last 24h
+      - record: instance:DCGM_FI_DEV_GPU_UTIL:avg24h
+        expr: avg by (job)(avg_over_time(DCGM_FI_DEV_GPU_UTIL{job="sample-dcgm"}[1d:15m]))
+
+      # Average GPU memory usage during last 24h
+      - record: instance:DCGM_FI_DEV_MEM_COPY_UTIL:avg24h
+        expr: avg by (job)(avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{job="sample-dcgm"}[1d:15m]))
+
+      # Total energy usage during last 24h in kWh.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      # We sum all the DCGM power readings during the last 24h at an interval of 15m.
+      # So, to estimate the total energy, we compute sum(P * deltat) / 3.6e9, where P
+      # is the power usage and deltat is the interval in milliseconds.
+      - record: instance:DCGM_FI_DEV_POWER_USAGE:pue_sum24h
+        expr: 1 * sum by (job)(sum_over_time((DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"})[1d:15m])) * 15 * 60000 / 3.6e9
+
+      # Total emissions from GPU during last 24h in grams, using the emission factor
+      # from RTE.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_gpu_emissions_rte:pue_sum24h
+        expr: |2
+          sum by (job)(
+            sum_over_time(
+              (
+                label_replace(
+                  1 * DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"} * 15 * 60000 / 3.6e+09,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(ceems_emissions_gCo2_kWh{provider="rte"}, "common_label", "mock", "hostname", "(.*)")
+              )[1d:15m]
+            )
+          )
+
+      # Total emissions from GPU during last 24h in grams, using the emission factor
+      # from Electricity Maps.
+      # PUE of 1 is used by default. Use an appropriate PUE by replacing the 1 with the
+      # PUE ratio in the expr.
+      - record: instance:ceems_gpu_emissions_emaps:pue_sum24h
+        expr: |2
+          sum by (job)(
+            sum_over_time(
+              (
+                label_replace(
+                  1 * DCGM_FI_DEV_POWER_USAGE{job="sample-dcgm"} * 15 * 60000 / 3.6e+09,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(ceems_emissions_gCo2_kWh{provider="emaps"}, "common_label", "mock", "hostname", "(.*)")
+              )[1d:15m]
+            )
+          )
diff --git a/configs/slurm/epilog.d/gpujobmap.sh b/etc/slurm/epilog.d/gpujobmap.sh
similarity index 100%
rename from configs/slurm/epilog.d/gpujobmap.sh
rename to etc/slurm/epilog.d/gpujobmap.sh
diff --git a/configs/slurm/epilog.d/slurmjobprops.sh b/etc/slurm/epilog.d/slurmjobprops.sh
similarity index 100%
rename from configs/slurm/epilog.d/slurmjobprops.sh
rename to etc/slurm/epilog.d/slurmjobprops.sh
diff --git a/configs/slurm/prolog.d/gpujobmap.sh b/etc/slurm/prolog.d/gpujobmap.sh
similarity index 100%
rename from configs/slurm/prolog.d/gpujobmap.sh
rename to etc/slurm/prolog.d/gpujobmap.sh
diff --git a/configs/slurm/prolog.d/slurmjobprops.sh b/etc/slurm/prolog.d/slurmjobprops.sh
similarity index 100%
rename from configs/slurm/prolog.d/slurmjobprops.sh
rename to etc/slurm/prolog.d/slurmjobprops.sh
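
The new rule files can be validated locally the same way the updated `checkrules`
target does (assuming promtool is on PATH):

    promtool check rules etc/prometheus/rules/cpu-only-nodes.rules etc/prometheus/rules/cpu-gpu-nodes.rules etc/prometheus/rules/gpu.rules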