Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prioritize SLURM_JOB_GPUS env for GPU mapping #221

Merged
merged 1 commit into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/collector/libvirt.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ func NewLibvirtCollector(logger *slog.Logger) (Collector, error) {
for _, gpuType := range gpuTypes {
gpuDevs, err = GetGPUDevices(gpuType, logger)
if err == nil {
logger.Info("gpu: " + gpuType)
logger.Info("GPU devices found", "type", gpuType, "num_devs", len(gpuDevs))

break
}
Expand Down
38 changes: 26 additions & 12 deletions pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ func NewSlurmCollector(logger *slog.Logger) (Collector, error) {
for _, gpuType := range gpuTypes {
gpuDevs, err = GetGPUDevices(gpuType, logger)
if err == nil {
logger.Info("gpu: " + gpuType)
logger.Info("GPU devices found", "type", gpuType, "num_devs", len(gpuDevs))

break
}
Expand Down Expand Up @@ -515,7 +515,7 @@ func readProcEnvirons(data interface{}) error {
return errors.New("data type cannot be asserted")
}

var gpuOrdinals []string
var stepGPUs, jobGPUs []string

// Env var that we will search
jobIDEnv := "SLURM_JOB_ID=" + d.uuid
Expand All @@ -528,6 +528,11 @@ func readProcEnvirons(data interface{}) error {
// have capabilities to read environment variables. So, we just do
// old school loop on procs and attempt to find target env variables.
for _, proc := range d.procs {
// If SLURM_JOB_GPUS env var is found, exit loop
if len(jobGPUs) > 0 {
break
}

// Read process environment variables
// NOTE: This needs CAP_SYS_PTRACE and CAP_DAC_READ_SEARCH caps
// on the current process
Expand All @@ -538,22 +543,31 @@ func readProcEnvirons(data interface{}) error {
}

// When env var entry found, get all necessary env vars
// NOTE: This is not really concurrent safe. Multiple go routines might
// overwrite the variables. But I think we can live with it as for a gievn
// job cgroup these env vars should be identical in all procs
for _, env := range environments {
if strings.Contains(env, "SLURM_STEP_GPUS") || strings.Contains(env, "SLURM_JOB_GPUS") {
gpuOrdinals = strings.Split(strings.Split(env, "=")[1], ",")
if strings.Contains(env, "SLURM_STEP_GPUS") {
stepGPUs = strings.Split(strings.Split(env, "=")[1], ",")
}

goto outside
if strings.Contains(env, "SLURM_JOB_GPUS") {
jobGPUs = strings.Split(strings.Split(env, "=")[1], ",")
}
}
}

outside:

// Set found gpuOrdinals on ctxData
d.gpuOrdinals = gpuOrdinals
// If both SLURM_STEP_GPUS and SLURM_JOB_GPUS are found, proritize
// SLURM_JOB_GPUS. We noticed that when both env vars are found,
// SLURM_STEP_GPUS is not correctly set.
// Technically SLURM_JOB_GPUS should be set for jobs and SLURM_STEP_GPUS
// should be set for steps like `srun`. When they are both set
// SLURM_STEP_GPUS should be a subset of SLURM_JOB_GPUS but this is not
// the case eversince we migrated to SLURM 23.11 on JZ. Maybe it is a
// side effect of Atos' patches?
// Relevant SLURM src: https://github.com/SchedMD/slurm/blob/d3e78848f72745ceb80e2a6bebdbcf3cfd7462b1/src/plugins/gres/common/gres_common.c#L262-L265
if len(jobGPUs) > 0 {
d.gpuOrdinals = jobGPUs
} else if len(stepGPUs) > 0 {
d.gpuOrdinals = stepGPUs
}

return nil
}
4 changes: 2 additions & 2 deletions pkg/collector/testdata/proc.ttar
Original file line number Diff line number Diff line change
Expand Up @@ -2295,7 +2295,7 @@ SymlinkTo: /usr/bin
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: proc/3346567/environ
Lines: 1
PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=1009248NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_JOB_GPUS=2,3NULLBYTESLURM_JOB_USER=testusrNULLBYTE
PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=1009248NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_JOB_GPUS=2,3NULLBYTESLURM_JOB_USER=testusrNULLBYTESLURM_STEP_GPUS=1,4NULLBYTE
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: proc/3346567/exe
Expand Down Expand Up @@ -2980,7 +2980,7 @@ SymlinkTo: /usr/bin
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: proc/3346596/environ
Lines: 1
PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=1009249NULLBYTESLURM_JOB_ACCOUNT=testacc2NULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_JOB_GPUS=0NULLBYTESLURM_JOB_USER=testusr2NULLBYTE
PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=1009249NULLBYTESLURM_JOB_ACCOUNT=testacc2NULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_JOB_GPUS=0NULLBYTESLURM_JOB_USER=testusr2NULLBYTESLURM_STEP_GPUS=1NULLBYTE
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: proc/3346596/exe
Expand Down