Skip to content

Commit

Permalink
docs: Update README and example configs
Browse files Browse the repository at this point in the history
Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Jan 6, 2024
1 parent 862f39d commit 16d33eb
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 18 deletions.
3 changes: 2 additions & 1 deletion init/systemd/batchjob_exporter_no_privs.service
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ Type=simple
User=batchjob-exp
Group=batchjob-exp
ExecStart=/usr/local/bin/batchjob_exporter \
--collector.slurm.gpu.type="nvidia" \
--collector.slurm.job.props.path="/run/slurmjobprops" \
--collector.slurm.nvidia.gpu.job.map.path="/run/gpujobmap" \
--collector.slurm.gpu.job.map.path="/run/gpujobmap" \
--collector.ipmi.dcmi.cmd="sudo /usr/sbin/ipmi-dcmi --get-system-power-statistics" \
--log.level="debug"

Expand Down
1 change: 1 addition & 0 deletions init/systemd/batchjob_exporter_with_caps.service
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Type=simple
User=batchjob-exp
Group=batchjob-exp
ExecStart=/usr/local/bin/batchjob_exporter \
--collector.slurm.gpu.type="nvidia" \
--collector.ipmi.dcmi.cmd="sudo /usr/sbin/ipmi-dcmi --get-system-power-statistics" \
--log.level="debug"

Expand Down
30 changes: 13 additions & 17 deletions pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,39 +22,35 @@ batchjob_scrape_collector_success{collector="rapl"} 1
batchjob_scrape_collector_success{collector="slurm_job"} 1
# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds
# TYPE batchjob_slurm_job_cpu_system_seconds gauge
batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502
batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 115.777502
# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds
# TYPE batchjob_slurm_job_cpu_total_seconds gauge
batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351
batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 60491.070351
# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds
# TYPE batchjob_slurm_job_cpu_user_seconds gauge
batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848
batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 60375.292848
# HELP batchjob_slurm_job_cpus Number of CPUs
# TYPE batchjob_slurm_job_cpus gauge
batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2
batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 2
# HELP batchjob_slurm_job_gpu_jobid_flag Indicates running job on GPU, 1=job running
# TYPE batchjob_slurm_job_gpu_jobid_flag gauge
batchjob_slurm_job_gpu_jobid_flag{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1
batchjob_slurm_job_gpu_jobid_flag{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1
# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_slurm_job_memory_cache_bytes gauge
batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_memory_fail_count Memory fail count
# TYPE batchjob_slurm_job_memory_fail_count gauge
batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes
# TYPE batchjob_slurm_job_memory_rss_bytes gauge
batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09
batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.098592768e+09
# HELP batchjob_slurm_job_memory_total_bytes Memory total in bytes
# TYPE batchjob_slurm_job_memory_total_bytes gauge
batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09
batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.294967296e+09
# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes
# TYPE batchjob_slurm_job_memory_used_bytes gauge
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09
# HELP batchjob_slurm_job_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU
# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06
# HELP batchjob_slurm_job_nvidia_gpu_jobid_flag Indicates running job on GPU, 1=job running
# TYPE batchjob_slurm_job_nvidia_gpu_jobid_flag gauge
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down

0 comments on commit 16d33eb

Please sign in to comment.