feat: Test new metric for GPU jobid map
* Use a flag-type metric that is set to 1, with jobid and GPU index as labels

* This increases cardinality, but it lets us compute aggregate metrics (see the example queries below)

Signed-off-by: Mahendra Paipuri <[email protected]>
mahendrapaipuri committed Jan 4, 2024
1 parent d8d5503 commit ccb134a
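
Because the flag metric always reports the value 1 and carries jobid and GPU index/UUID labels, it can be aggregated per job or joined onto per-GPU utilisation metrics on the UUID label. The PromQL below is only a sketch of that usage: the metric names match the fixtures in this commit, while DCGM_FI_DEV_GPU_UTIL (with its UUID label) is an assumed metric from NVIDIA's dcgm-exporter and is not part of this change.

# Number of GPUs currently allocated to each job
count by (jobid) (batchjob_slurm_job_nvidia_gpu_jobid_flag)

# Mean GPU utilisation per job, joining dcgm-exporter data on the GPU UUID
avg by (jobid) (
  batchjob_slurm_job_nvidia_gpu_jobid_flag
  * on (UUID) group_left() DCGM_FI_DEV_GPU_UTIL
)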
Showing 6 changed files with 41 additions and 15 deletions.
4 changes: 4 additions & 0 deletions pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt
@@ -51,6 +51,10 @@ batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testa
# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06
# HELP batchjob_slurm_job_nvidia_gpu_jobid_flag Indicates running job on GPU, 1=job running
# TYPE batchjob_slurm_job_nvidia_gpu_jobid_flag gauge
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
@@ -66,6 +66,10 @@ batchjob_slurm_job_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testac
# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06
# HELP batchjob_slurm_job_nvidia_gpu_jobid_flag Indicates running job on GPU, 1=job running
# TYPE batchjob_slurm_job_nvidia_gpu_jobid_flag gauge
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
4 changes: 4 additions & 0 deletions pkg/collector/fixtures/output/e2e-test-cgroupsv2-output.txt
@@ -51,6 +51,10 @@ batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testa
# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06
# HELP batchjob_slurm_job_nvidia_gpu_jobid_flag Indicates running job on GPU, 1=job running
# TYPE batchjob_slurm_job_nvidia_gpu_jobid_flag gauge
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
@@ -51,6 +51,10 @@ batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testa
# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06
batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06
# HELP batchjob_slurm_job_nvidia_gpu_jobid_flag Indicates running job on GPU, 1=job running
# TYPE batchjob_slurm_job_nvidia_gpu_jobid_flag gauge
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1
batchjob_slurm_job_nvidia_gpu_jobid_flag{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
32 changes: 21 additions & 11 deletions pkg/collector/slurm.go
@@ -99,7 +99,7 @@ type slurmCollector struct {
cgroupsRootPath string
slurmCgroupsPath string
hostname string
nvidiaGPUDevs map[int]Device
gpuDevs map[int]Device
cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc
@@ -115,6 +115,7 @@ type slurmCollector struct {
memswFailCount *prometheus.Desc
memoryPressure *prometheus.Desc
gpuJobMap *prometheus.Desc
gpuJobFlag *prometheus.Desc
collectError *prometheus.Desc
logger log.Logger
}
@@ -163,17 +164,20 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
}
}

// Attempt to get nVIDIA GPU devices
nvidiaGPUDevs, err := GetNvidiaGPUDevices(*nvidiaSmiPath, logger)
if err == nil {
level.Info(logger).Log("msg", "nVIDIA GPU devices found")
// Attempt to get GPU devices
var gpuDevs map[int]Device
if _, err := os.Stat(*nvidiaSmiPath); err == nil {
gpuDevs, err = GetNvidiaGPUDevices(*nvidiaSmiPath, logger)
if err == nil {
level.Info(logger).Log("msg", "nVIDIA GPU devices found")
}
}
return &slurmCollector{
cgroups: cgroupsVersion,
cgroupsRootPath: cgroupsRootPath,
slurmCgroupsPath: slurmCgroupsPath,
hostname: hostname,
nvidiaGPUDevs: nvidiaGPUDevs,
gpuDevs: gpuDevs,
cpuUser: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_user_seconds"),
"Cumulative CPU user seconds",
@@ -264,6 +268,11 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
[]string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"},
nil,
),
gpuJobFlag: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "nvidia_gpu_jobid_flag"),
"Indicates running job on GPU, 1=job running",
[]string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "index", "uuid", "UUID"}, nil,
),
gpuJobMap: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "nvidia_gpu_jobid"),
"Batch Job ID of current nVIDIA GPU",
@@ -327,13 +336,14 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
for _, gpuOrdinal := range m.jobGpuOrdinals {
var uuid string
// Check the int index of devices where gpuOrdinal == dev.index
for _, dev := range c.nvidiaGPUDevs {
for _, dev := range c.gpuDevs {
if gpuOrdinal == dev.index {
uuid = dev.uuid
break
}
}
ch <- prometheus.MustNewConstMetric(c.gpuJobMap, prometheus.GaugeValue, float64(jid), m.batch, c.hostname, gpuOrdinal, uuid, uuid)
ch <- prometheus.MustNewConstMetric(c.gpuJobFlag, prometheus.GaugeValue, float64(1), m.batch, c.hostname, m.jobid, m.jobaccount, m.jobuuid, gpuOrdinal, uuid, uuid)
}
}
return nil
@@ -486,8 +496,8 @@ func (c *slurmCollector) getJobProperties(metric *CgroupMetric, pids []uint64) {
// it but just to be safe. This will have a small overhead as we need to check the
// correct integer index for each device index. We can live with it as there are
// typically 2/4/8 GPUs per node.
for i := 0; i < len(c.nvidiaGPUDevs); i++ {
dev := c.nvidiaGPUDevs[i]
for i := 0; i < len(c.gpuDevs); i++ {
dev := c.gpuDevs[i]
gpuJobMapInfo := fmt.Sprintf("%s/%s", *gpuStatPath, dev.index)

// NOTE: Look for file name with UUID as it will be more appropriate with
@@ -512,7 +522,7 @@ func (c *slurmCollector) getJobProperties(metric *CgroupMetric, pids []uint64) {
// If we fail to get any of the job properties or if there are atleast one GPU devices
// and if we fail to get gpu ordinals for that job, try to get these properties
// by looking into environment variables
if jobUid == "" || jobAccount == "" || jobNodelist == "" || (len(jobGpuOrdinals) == 0 && len(c.nvidiaGPUDevs) > 0) {
if jobUid == "" || jobAccount == "" || jobNodelist == "" || (len(jobGpuOrdinals) == 0 && len(c.gpuDevs) > 0) {
// Attempt to get UID, Account, Nodelist from /proc file system by looking into
// environ for the process that has same SLURM_JOB_ID
//
@@ -594,7 +604,7 @@
Log("msg", "Failed to get job properties", "jobid", jobid)
}
// Emit warning when there are GPUs but no job to GPU map found
if len(c.nvidiaGPUDevs) > 0 && len(jobGpuOrdinals) == 0 {
if len(c.gpuDevs) > 0 && len(jobGpuOrdinals) == 0 {
level.Warn(c.logger).
Log("msg", "Failed to get GPU ordinals for job", "jobid", jobid)
}
8 changes: 4 additions & 4 deletions pkg/collector/slurm_test.go
@@ -36,7 +36,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) {
}
c := slurmCollector{
cgroups: "v2",
nvidiaGPUDevs: mockGPUDevices(),
gpuDevs: mockGPUDevices(),
cgroupsRootPath: *cgroupfsPath,
slurmCgroupsPath: fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath),
logger: log.NewNopLogger(),
@@ -90,7 +90,7 @@ func TestCgroupsV2SlurmJobMetricsWithProcFs(t *testing.T) {
c := slurmCollector{
cgroups: "v2",
cgroupsRootPath: *cgroupfsPath,
nvidiaGPUDevs: mockGPUDevices(),
gpuDevs: mockGPUDevices(),
slurmCgroupsPath: fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath),
logger: log.NewNopLogger(),
}
@@ -142,7 +142,7 @@ func TestCgroupsV2SlurmJobMetricsNoJobProps(t *testing.T) {
c := slurmCollector{
cgroups: "v2",
cgroupsRootPath: *cgroupfsPath,
nvidiaGPUDevs: mockGPUDevices(),
gpuDevs: mockGPUDevices(),
slurmCgroupsPath: fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath),
logger: log.NewNopLogger(),
}
@@ -195,7 +195,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) {
c := slurmCollector{
cgroups: "v1",
logger: log.NewNopLogger(),
nvidiaGPUDevs: mockGPUDevices(),
gpuDevs: mockGPUDevices(),
cgroupsRootPath: fmt.Sprintf("%s/cpuacct", *cgroupfsPath),
slurmCgroupsPath: fmt.Sprintf("%s/cpuacct/slurm", *cgroupfsPath),
}
