Skip to content

Commit

Permalink
replace nvml with calling and parsing nvidia-smi
Browse files Browse the repository at this point in the history
  • Loading branch information
nickpetrovic committed Jan 19, 2024
1 parent a21bdd1 commit c3bb444
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 81 deletions.
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ module github.com/beam-cloud/beam
go 1.21

require (
github.com/NVIDIA/go-nvml v0.12.0-1
github.com/alicebob/miniredis/v2 v2.30.5
github.com/asecurityteam/rolling v0.0.0-20230418204413-b4052899307d
github.com/aws/aws-sdk-go-v2 v1.24.0
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ github.com/ClickHouse/clickhouse-go/v2 v2.16.0 h1:rhMfnPewXPnY4Q4lQRGdYuTLRBRKJE
github.com/ClickHouse/clickhouse-go/v2 v2.16.0/go.mod h1:J7SPfIxwR+x4mQ+o8MLSe0oY50NNntEqCIjFe/T1VPM=
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5 h1:TngWCqHvy9oXAN6lEVMRuU21PR1EtLVZJmdB18Gu3Rw=
github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5/go.mod h1:lmUJ/7eu/Q8D7ML55dXQrVaamCz2vxCfdQBasLZfHKk=
github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a h1:HbKu58rmZpUGpz5+4FfNmIU+FmZg2P3Xaj2v2bfNWmk=
Expand Down
8 changes: 0 additions & 8 deletions internal/worker/metrics_darwin.go

This file was deleted.

23 changes: 0 additions & 23 deletions internal/worker/metrics_linux.go

This file was deleted.

45 changes: 45 additions & 0 deletions internal/worker/procutil.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
package worker

import (
"bufio"
"errors"
"fmt"
"os/exec"
"runtime"
"strconv"
"strings"

"github.com/prometheus/procfs"
)
Expand All @@ -11,6 +17,45 @@ type GpuMemoryUsageStats struct {
TotalCapacity int64
}

// GetGpuMemoryUsage retrieves the memory usage of a specific NVIDIA GPU by
// invoking nvidia-smi and parsing the first line of its CSV output.
//
// deviceIndex is forwarded to nvidia-smi through the --id flag to select the
// GPU. On success the returned stats hold the total and used memory in bytes.
// On any failure (nvidia-smi cannot be run, prints nothing, or prints a line
// that cannot be parsed as two integers) a zero-valued GpuMemoryUsageStats and
// a non-nil error are returned. Underlying errors are wrapped with %w so
// callers can inspect them with errors.Is / errors.As.
func GetGpuMemoryUsage(deviceIndex int) (GpuMemoryUsageStats, error) {
	// The figures parsed from nvidia-smi are scaled by this factor to obtain
	// bytes (the query is issued with "nounits", so only bare numbers come back).
	const bytesPerMiB = 1024 * 1024

	command := "nvidia-smi"
	commandArgs := []string{"--query-gpu=memory.total,memory.used", "--format=csv,noheader,nounits", fmt.Sprintf("--id=%d", deviceIndex)}

	out, err := exec.Command(command, commandArgs...).Output()
	if err != nil {
		return GpuMemoryUsageStats{}, fmt.Errorf("unable to invoke nvidia-smi: %w", err)
	}

	scanner := bufio.NewScanner(strings.NewReader(string(out)))
	if !scanner.Scan() {
		// Scan returns false both on a read error and on empty input; report
		// each case explicitly instead of silently returning zeroed stats.
		if err := scanner.Err(); err != nil {
			return GpuMemoryUsageStats{}, fmt.Errorf("unable to read nvidia-smi output: %w", err)
		}
		return GpuMemoryUsageStats{}, errors.New("unable to parse gpu memory info: nvidia-smi returned no output")
	}

	// Expected shape of the line: "<total>, <used>".
	fields := strings.Split(scanner.Text(), ",")
	if len(fields) != 2 {
		return GpuMemoryUsageStats{}, errors.New("unable to parse gpu memory info")
	}

	total, err := strconv.ParseInt(strings.TrimSpace(fields[0]), 10, 64)
	if err != nil {
		return GpuMemoryUsageStats{}, fmt.Errorf("unable to parse total gpu memory: %w", err)
	}

	used, err := strconv.ParseInt(strings.TrimSpace(fields[1]), 10, 64)
	if err != nil {
		return GpuMemoryUsageStats{}, fmt.Errorf("unable to parse used gpu memory: %w", err)
	}

	return GpuMemoryUsageStats{
		TotalCapacity: total * bytesPerMiB,
		UsedCapacity:  used * bytesPerMiB,
	}, nil
}

func GetSystemCPU() (float64, error) {
fs, err := procfs.NewFS("/proc")
if err != nil {
Expand Down
8 changes: 0 additions & 8 deletions internal/worker/procutil_darwin.go

This file was deleted.

37 changes: 0 additions & 37 deletions internal/worker/procutil_linux.go

This file was deleted.

2 changes: 0 additions & 2 deletions internal/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ func NewWorker() (*Worker, error) {
statsdRepo := repo.NewMetricsStatsdRepository()

workerMetrics := NewWorkerMetrics(ctx, podHostName, statsdRepo, workerRepo, repo.NewMetricsStreamRepository(ctx, config.Metrics))
workerMetrics.InitNvml()

return &Worker{
ctx: ctx,
Expand Down Expand Up @@ -722,7 +721,6 @@ func (s *Worker) shutdown() error {
return err
}

s.workerMetrics.Shutdown()
s.cancel()
return nil
}

0 comments on commit c3bb444

Please sign in to comment.