From c3bb444a5a86b3c2ea69d6fcee7f0c7c55905c4c Mon Sep 17 00:00:00 2001
From: Nick Petrovic
Date: Thu, 18 Jan 2024 21:50:51 -0500
Subject: [PATCH] replace nvml with calling and parsing nvidia-smi

---
 go.mod                             |  1 -
 go.sum                             |  2 -
 internal/worker/metrics_darwin.go  |  8 ----
 internal/worker/metrics_linux.go   | 23 ------------
 internal/worker/procutil.go        | 59 ++++++++++++++++++++++++++++++
 internal/worker/procutil_darwin.go |  8 ----
 internal/worker/procutil_linux.go  | 37 -------------------
 internal/worker/worker.go          |  2 -
 8 files changed, 59 insertions(+), 81 deletions(-)
 delete mode 100644 internal/worker/metrics_darwin.go
 delete mode 100644 internal/worker/metrics_linux.go
 delete mode 100644 internal/worker/procutil_darwin.go
 delete mode 100644 internal/worker/procutil_linux.go

diff --git a/go.mod b/go.mod
index 3755f91bc..3366719c0 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,6 @@ module github.com/beam-cloud/beam
 go 1.21
 
 require (
-	github.com/NVIDIA/go-nvml v0.12.0-1
 	github.com/alicebob/miniredis/v2 v2.30.5
 	github.com/asecurityteam/rolling v0.0.0-20230418204413-b4052899307d
 	github.com/aws/aws-sdk-go-v2 v1.24.0
diff --git a/go.sum b/go.sum
index 0341adcb1..7599ab692 100644
--- a/go.sum
+++ b/go.sum
@@ -11,8 +11,6 @@ github.com/ClickHouse/clickhouse-go/v2 v2.16.0 h1:rhMfnPewXPnY4Q4lQRGdYuTLRBRKJE
 github.com/ClickHouse/clickhouse-go/v2 v2.16.0/go.mod h1:J7SPfIxwR+x4mQ+o8MLSe0oY50NNntEqCIjFe/T1VPM=
 github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
 github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
-github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
-github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
 github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5 h1:TngWCqHvy9oXAN6lEVMRuU21PR1EtLVZJmdB18Gu3Rw=
 github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5/go.mod h1:lmUJ/7eu/Q8D7ML55dXQrVaamCz2vxCfdQBasLZfHKk=
 github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a h1:HbKu58rmZpUGpz5+4FfNmIU+FmZg2P3Xaj2v2bfNWmk=
diff --git a/internal/worker/metrics_darwin.go b/internal/worker/metrics_darwin.go
deleted file mode 100644
index ee68444aa..000000000
--- a/internal/worker/metrics_darwin.go
+++ /dev/null
@@ -1,8 +0,0 @@
-//go:build darwin
-// +build darwin
-
-package worker
-
-func (wm *WorkerMetrics) InitNvml() {}
-
-func (wm *WorkerMetrics) Shutdown() {}
diff --git a/internal/worker/metrics_linux.go b/internal/worker/metrics_linux.go
deleted file mode 100644
index cfbcd45f6..000000000
--- a/internal/worker/metrics_linux.go
+++ /dev/null
@@ -1,23 +0,0 @@
-//go:build linux
-// +build linux
-
-package worker
-
-import (
-	"github.com/NVIDIA/go-nvml/pkg/nvml"
-	"github.com/okteto/okteto/pkg/log"
-)
-
-func (wm *WorkerMetrics) InitNvml() {
-	// TODO: investigate segmentation violation
-	// wm.nvmlActive = nvml.Init() == nvml.SUCCESS
-	wm.nvmlActive = false
-}
-
-func (wm *WorkerMetrics) Shutdown() {
-	if wm.nvmlActive {
-		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
-			log.Printf("Failed to shutdown nvml: %v\n", ret)
-		}
-	}
-}
diff --git a/internal/worker/procutil.go b/internal/worker/procutil.go
index afdbc68d8..d5dc81a3c 100644
--- a/internal/worker/procutil.go
+++ b/internal/worker/procutil.go
@@ -1,7 +1,13 @@
 package worker
 
 import (
+	"bufio"
+	"errors"
+	"fmt"
+	"os/exec"
 	"runtime"
+	"strconv"
+	"strings"
 
 	"github.com/prometheus/procfs"
 )
@@ -11,6 +17,59 @@ type GpuMemoryUsageStats struct {
 	TotalCapacity int64
 }
 
+// GetGpuMemoryUsage reports the memory usage of the NVIDIA GPU at deviceIndex.
+//
+// It runs nvidia-smi with --query-gpu=memory.total,memory.used and parses the
+// first line of its CSV output (for example "16384, 1024"). The two values are
+// treated as MiB and converted to bytes in the returned stats.
+//
+// An error is returned when nvidia-smi cannot be run, prints no output, or
+// prints a line that is not two comma-separated integers. The returned stats
+// are zero whenever the error is non-nil.
+func GetGpuMemoryUsage(deviceIndex int) (GpuMemoryUsageStats, error) {
+	const bytesPerMiB = 1024 * 1024
+
+	out, err := exec.Command(
+		"nvidia-smi",
+		"--query-gpu=memory.total,memory.used",
+		"--format=csv,noheader,nounits",
+		fmt.Sprintf("--id=%d", deviceIndex),
+	).Output()
+	if err != nil {
+		return GpuMemoryUsageStats{}, fmt.Errorf("unable to invoke nvidia-smi: %w", err)
+	}
+
+	scanner := bufio.NewScanner(strings.NewReader(string(out)))
+	if !scanner.Scan() {
+		// Scan returns false both on a read error and on empty input; either way
+		// there is nothing to parse, so report it instead of returning zeroed stats.
+		if err := scanner.Err(); err != nil {
+			return GpuMemoryUsageStats{}, fmt.Errorf("unable to read nvidia-smi output: %w", err)
+		}
+		return GpuMemoryUsageStats{}, errors.New("unable to parse gpu memory info: empty nvidia-smi output")
+	}
+
+	fields := strings.Split(scanner.Text(), ",")
+	if len(fields) != 2 {
+		return GpuMemoryUsageStats{}, errors.New("unable to parse gpu memory info")
+	}
+
+	total, err := strconv.ParseInt(strings.TrimSpace(fields[0]), 10, 64)
+	if err != nil {
+		return GpuMemoryUsageStats{}, fmt.Errorf("unable to parse total gpu memory: %w", err)
+	}
+
+	used, err := strconv.ParseInt(strings.TrimSpace(fields[1]), 10, 64)
+	if err != nil {
+		return GpuMemoryUsageStats{}, fmt.Errorf("unable to parse used gpu memory: %w", err)
+	}
+
+	return GpuMemoryUsageStats{
+		UsedCapacity:  used * bytesPerMiB,
+		TotalCapacity: total * bytesPerMiB,
+	}, nil
+}
+
 func GetSystemCPU() (float64, error) {
 	fs, err := procfs.NewFS("/proc")
 	if err != nil {
diff --git a/internal/worker/procutil_darwin.go b/internal/worker/procutil_darwin.go
deleted file mode 100644
index bb0ea2062..000000000
--- a/internal/worker/procutil_darwin.go
+++ /dev/null
@@ -1,8 +0,0 @@
-//go:build darwin
-// +build darwin
-
-package worker
-
-func GetGpuMemoryUsage(deviceId int) (GpuMemoryUsageStats, error) {
-	return GpuMemoryUsageStats{0, 0}, nil
-}
diff --git a/internal/worker/procutil_linux.go b/internal/worker/procutil_linux.go
deleted file mode 100644
index ff98ecc3b..000000000
--- a/internal/worker/procutil_linux.go
+++ /dev/null
@@ -1,37 +0,0 @@
-//go:build linux
-// +build linux
-
-package worker
-
-import (
-	"errors"
-	"strconv"
-
-	"github.com/NVIDIA/go-nvml/pkg/nvml"
-	"github.com/okteto/okteto/pkg/log"
-)
-
-func GetGpuMemoryUsage(deviceId int) (GpuMemoryUsageStats, error) {
-	stats := GpuMemoryUsageStats{}
-
-	device, ret := nvml.DeviceGetHandleByIndex(deviceId)
-	if ret != nvml.SUCCESS {
-		log.Error("Unable to get device at index %d: %v", deviceId, nvml.ErrorString(ret))
-		return stats, errors.New(
-			"Unable to get device at index " + strconv.Itoa(deviceId) + ": " + nvml.ErrorString(ret),
-		)
-	}
-
-	memory, ret := device.GetMemoryInfo()
-	if ret != nvml.SUCCESS {
-		log.Error("Unable to get memory info for device at index %d: %v", deviceId, nvml.ErrorString(ret))
-		return stats, errors.New(
-			"Unable to get memory info for device at index " + strconv.Itoa(deviceId) + ": " + nvml.ErrorString(ret),
-		)
-	}
-
-	stats.UsedCapacity = int64(memory.Used)
-	stats.TotalCapacity = int64(memory.Total)
-
-	return stats, nil
-}
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
index 98b529a9f..ed68832e4 100644
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -143,7 +143,6 @@ func NewWorker() (*Worker, error) {
 
 	statsdRepo := repo.NewMetricsStatsdRepository()
 	workerMetrics := NewWorkerMetrics(ctx, podHostName, statsdRepo, workerRepo, repo.NewMetricsStreamRepository(ctx, config.Metrics))
-	workerMetrics.InitNvml()
 
 	return &Worker{
 		ctx: ctx,
@@ -722,7 +721,6 @@ func (s *Worker) shutdown() error {
 		return err
 	}
 
-	s.workerMetrics.Shutdown()
 	s.cancel()
 	return nil
 }