Skip to content

Commit

Permalink
fix: Correct max index while looping over devs
Browse files Browse the repository at this point in the history
* Ensure we trim spaces in nvidia-smi output

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Jan 2, 2024
1 parent d7facbe commit c41e534
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pkg/collector/fixtures/nvidia-smi
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

-echo """index, name, uuid
+printf """index, name, uuid
0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e
1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3
2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[int]Devic
// Get all devices
gpuDevices := map[int]Device{}
devIndxInt := 0
-for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") {
+for _, line := range strings.Split(strings.TrimSpace(string(nvidiaSmiOutput)), "\n") {
// Header line, empty line and newlines are ignored
if line == "" || line == "\n" || strings.HasPrefix(line, "index") {
continue
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ func (c *slurmCollector) getJobProperties(metric *CgroupMetric, pids []uint64) {
// it but just to be safe. This will have a small overhead as we need to check the
// correct integer index for each device index. We can live with it as there are
// typically 2/4/8 GPUs per node.
-for i := 0; i <= len(c.nvidiaGPUDevs); i++ {
+for i := 0; i < len(c.nvidiaGPUDevs); i++ {
dev := c.nvidiaGPUDevs[i]
gpuJobMapInfo := fmt.Sprintf("%s/%s", *gpuStatPath, dev.index)

Expand Down

0 comments on commit c41e534

Please sign in to comment.