From 31479b35f45912e4fb3115d6c3f75dea15fef483 Mon Sep 17 00:00:00 2001 From: Yash Anand Date: Thu, 12 Sep 2024 08:24:07 -0400 Subject: [PATCH] Fix GPU checkpoint --- Makefile | 2 +- docker/Dockerfile.worker | 2 +- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- pkg/types/config.go | 2 +- pkg/worker/cedana.go | 14 +++++++------- pkg/worker/nvidia.go | 6 ++++-- pkg/worker/worker.go | 5 +++-- 8 files changed, 38 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index dfeecb1bd..bbf958407 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ SHELL := /bin/bash tag := latest workerTag := latest runnerTag := latest -cedanaTag := 0.9.220 +cedanaTag := 0.9.222 setup: bash bin/setup.sh diff --git a/docker/Dockerfile.worker b/docker/Dockerfile.worker index cd8e5226d..cbf3c23f7 100644 --- a/docker/Dockerfile.worker +++ b/docker/Dockerfile.worker @@ -72,7 +72,7 @@ RUN go build -o /usr/local/bin/worker ./cmd/worker/main.go # final image # ======================== -FROM nvidia/cuda:12.3.1-base-ubuntu20.04 AS release +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu20.04 AS release FROM release AS dev FROM ${BASE_STAGE} AS final diff --git a/go.mod b/go.mod index 6e4a358c0..2ab9ccfcf 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/beam-cloud/clip v0.0.0-20240826223025-899feb184e88 github.com/beam-cloud/go-runc v0.0.0-20231222221338-b89899f33170 github.com/bsm/redislock v0.9.4 - github.com/cedana/cedana v0.9.219 + github.com/cedana/cedana v0.9.222 github.com/cenkalti/backoff v2.2.1+incompatible github.com/cloudevents/sdk-go/v2 v2.15.1 github.com/coreos/go-iptables v0.7.1-0.20240112124308-65c67c9f46e6 @@ -47,7 +47,7 @@ require ( github.com/prometheus/client_golang v1.19.1 github.com/prometheus/procfs v0.13.0 github.com/redis/go-redis/v9 v9.5.1 - github.com/rs/zerolog v1.32.0 + github.com/rs/zerolog v1.33.0 github.com/shirou/gopsutil/v4 v4.24.6 github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.9.0 @@ -104,7 +104,7 @@ require ( github.com/dgraph-io/ristretto v0.1.1 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/digitalocean/go-smbios v0.0.0-20180907143718-390a4f403a8e // indirect - github.com/docker/docker v26.1.4+incompatible // indirect + github.com/docker/docker v27.2.0+incompatible // indirect github.com/docker/go-units v0.5.0 // indirect github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect github.com/dustin/go-humanize v1.0.1 // indirect @@ -167,7 +167,7 @@ require ( github.com/mitchellh/go-ps v1.0.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect - github.com/moby/sys/user v0.1.0 // indirect + github.com/moby/sys/user v0.3.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect @@ -223,15 +223,15 @@ require ( go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect golang.org/x/crypto v0.26.0 // indirect golang.org/x/mod v0.19.0 // indirect - golang.org/x/oauth2 v0.21.0 // indirect + golang.org/x/oauth2 v0.22.0 // indirect golang.org/x/sync v0.8.0 // indirect golang.org/x/term v0.23.0 // indirect golang.org/x/text v0.17.0 // indirect - golang.org/x/time v0.5.0 // indirect + golang.org/x/time v0.6.0 // indirect golang.org/x/tools v0.23.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect golang.zx2c4.com/wireguard/windows v0.5.3 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240812133136-8ffd90a71988 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index 1efc741e8..6bf8df7e6 100644 --- a/go.sum +++ b/go.sum @@ -99,8 +99,8 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/bsm/redislock v0.9.4 h1:X/Wse1DPpiQgHbVYRE9zv6m070UcKoOGekgvpNhiSvw= github.com/bsm/redislock v0.9.4/go.mod h1:Epf7AJLiSFwLCiZcfi6pWFO/8eAYrYpQXFxEDPoDeAk= -github.com/cedana/cedana v0.9.219 h1:YkfEbNjhXYDRfo5eOk0XwDSdaSNDxb9DIIkSx4YQEyE= -github.com/cedana/cedana v0.9.219/go.mod h1:20+3e/l39akJFOmjHXxJeRIvpBIfeWbKdNt1J9wSh+A= +github.com/cedana/cedana v0.9.222 h1:ZIUuHWP3hRxrkPE/p108SJ079kdLNbBXx+CMV3+ywsU= +github.com/cedana/cedana v0.9.222/go.mod h1:IQDFr9/H9Opl2ym3xQoFoqLCO/KzebVMWkO7Mta3egk= github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -149,8 +149,8 @@ github.com/digitalocean/go-smbios v0.0.0-20180907143718-390a4f403a8e h1:vUmf0yez github.com/digitalocean/go-smbios v0.0.0-20180907143718-390a4f403a8e/go.mod h1:YTIHhz/QFSYnu/EhlF2SpU2Uk+32abacUYA5ZPljz1A= github.com/djherbis/times v1.6.0 h1:w2ctJ92J8fBvWPxugmXIv7Nz7Q3iDMKNx9v5ocVH20c= github.com/djherbis/times v1.6.0/go.mod h1:gOHeRAz2h+VJNZ5Gmc/o7iD9k4wW7NMVqieYCY99oc0= -github.com/docker/docker v26.1.4+incompatible h1:vuTpXDuoga+Z38m1OZHzl7NKisKWaWlhjQk7IDPSLsU= -github.com/docker/docker v26.1.4+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v27.2.0+incompatible h1:Rk9nIVdfH3+Vz4cyI/uhbINhEZ/oLmc+CBXmH6fbNk4= +github.com/docker/docker v27.2.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= @@ -407,8 +407,8 @@ github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g= github.com/moby/sys/mountinfo v0.7.1/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= -github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg= -github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU= +github.com/moby/sys/user v0.3.0 h1:9ni5DlcW5an3SvRSx4MouotOygvzaXbaSrc/wGDFWPo= +github.com/moby/sys/user v0.3.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -495,8 +495,8 @@ github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99 github.com/rootless-containers/proto v0.1.0 h1:gS1JOMEtk1YDYHCzBAf/url+olMJbac7MTrgSeP6zh4= github.com/rootless-containers/proto v0.1.0/go.mod h1:vgkUFZbQd0gcE/K/ZwtE4MYjZPu0UNHLXIQxhyqAFh8= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= -github.com/rs/zerolog v1.32.0 h1:keLypqrlIjaFsbmJOBdB/qvyF8KEtCWHwobLp5l/mQ0= -github.com/rs/zerolog v1.32.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= +github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= +github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -654,8 +654,8 @@ golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwY golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= +golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -704,8 +704,8 @@ golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= +golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -729,8 +729,8 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240812133136-8ffd90a71988 h1:V71AcdLZr2p8dC9dbOIMCpqi4EmRl8wUwnJzXXLmbmc= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240812133136-8ffd90a71988/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd h1:6TEm2ZxXoQmFWFlt1vNxvVOa1Q0dXFQD1m/rYjXmS0E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= diff --git a/pkg/types/config.go b/pkg/types/config.go index 08fb938bf..7275cc7be 100644 --- a/pkg/types/config.go +++ b/pkg/types/config.go @@ -4,7 +4,7 @@ import ( "time" blobcache "github.com/beam-cloud/blobcache-v2/pkg" - cedana "github.com/cedana/cedana/types" + cedana "github.com/cedana/cedana/pkg/types" corev1 "k8s.io/api/core/v1" ) diff --git a/pkg/worker/cedana.go b/pkg/worker/cedana.go index 3377a863f..336bb98f7 100644 --- a/pkg/worker/cedana.go +++ b/pkg/worker/cedana.go @@ -5,7 +5,7 @@ import ( "fmt" "time" - api "github.com/cedana/cedana/api/services/task" + api "github.com/cedana/cedana/pkg/api/services/task" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -66,9 +66,9 @@ func (c *CedanaClient) Checkpoint(ctx context.Context, containerId string) error defer cancel() args := api.DumpArgs{ - Type: api.CRType_LOCAL, - JID: containerId, - TcpEstablished: true, + Type: api.CRType_LOCAL, + JID: containerId, + CriuOpts: &api.CriuOpts{TcpEstablished: true}, // Dump dir taken from config } _, err := c.service.Dump(ctx, &args) @@ -84,9 +84,9 @@ func (c *CedanaClient) Restore(ctx context.Context, containerId string) error { defer cancel() args := &api.RestoreArgs{ - Type: api.CRType_LOCAL, - JID: containerId, - TcpEstablished: true, + Type: api.CRType_LOCAL, + JID: containerId, + CriuOpts: &api.CriuOpts{TcpEstablished: true}, } _, err := c.service.Restore(ctx, args) // TODO gather metrics from response diff --git a/pkg/worker/nvidia.go b/pkg/worker/nvidia.go index 3e859f169..8ec30d672 100644 --- a/pkg/worker/nvidia.go +++ b/pkg/worker/nvidia.go @@ -13,7 +13,7 @@ import ( ) var ( - defaultContainerCudaVersion string = "12.3" + defaultContainerCudaVersion string = "12.4" defaultContainerPath []string = []string{"/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"} defaultContainerLibrary []string = []string{"/usr/lib/x86_64-linux-gnu", "/usr/lib/worker/x86_64-linux-gnu", "/usr/local/nvidia/lib64"} ) @@ -270,7 +270,9 @@ func (c *ContainerNvidiaManager) InjectEnvVars(env []string, options *ContainerO } func (c *ContainerNvidiaManager) InjectMounts(mounts []specs.Mount) []specs.Mount { - cudaPaths := []string{fmt.Sprintf("/usr/local/cuda-%s", defaultContainerCudaVersion), "/usr/local/nvidia/lib64"} + // /usr/local/cuda already points to the installed CUDA lib in the worker + // XXX: Could remove hardcoded defaultContainerCudaVersion from this file + cudaPaths := []string{"/usr/local/cuda", "/usr/local/nvidia/lib64"} for _, path := range cudaPaths { if _, err := os.Stat(path); os.IsNotExist(err) { diff --git a/pkg/worker/worker.go b/pkg/worker/worker.go index 627f51c4a..b9b3a8533 100644 --- a/pkg/worker/worker.go +++ b/pkg/worker/worker.go @@ -854,10 +854,11 @@ func (s *Worker) specFromRequest(request *types.ContainerRequest, options *Conta return nil, fmt.Errorf("failed to parse cedana config: %v", err) } originalArgs := "\"" + strings.Join(spec.Process.Args, " ") + "\"" - // TODO: Detect and pass in --cuda flag - cedanaArgs := fmt.Sprintf("%s daemon start --gpu-enabled=%t --config='%s' & %s exec %s -w %s -i %s --attach", + cudaVersion := "12.4" // XXX: Should detect and modify if using custom image + cedanaArgs := fmt.Sprintf("%s daemon start --gpu-enabled=%t --cuda %s --config='%s' & %s exec %s -w %s -i %s --attach", CedanaPath, request.Gpu != "", + cudaVersion, configJSON, CedanaPath, originalArgs,