Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
yashanand1910 committed Oct 15, 2024
1 parent 85cfbac commit 5ae4f87
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 46 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ SHELL := /bin/bash
tag := latest
workerTag := latest
runnerTag := latest
cedanaTag := 0.9.222
cedanaTag := 0.9.227

setup:
bash bin/setup.sh
Expand Down Expand Up @@ -33,7 +33,7 @@ gateway:
docker push localhost:5001/beta9-gateway:$(tag)

worker:
docker build . --target final --build-arg BASE_STAGE=dev -f ./docker/Dockerfile.worker -t localhost:5001/beta9-worker:$(workerTag)
docker build . --build-arg CEDANA_VERSION=$(cedanaTag) --target final --build-arg BASE_STAGE=dev -f ./docker/Dockerfile.worker -t localhost:5001/beta9-worker:$(workerTag)
docker push localhost:5001/beta9-worker:$(workerTag)
bin/delete_workers.sh

Expand All @@ -43,7 +43,7 @@ proxy:

runner:
for target in py312 py311 py310 py39 py38; do \
docker build . --build-arg CEDANA_VERSION=$(cedanaTag) --no-cache --target $$target --platform=linux/amd64 -f ./docker/Dockerfile.runner -t localhost:5001/beta9-runner:$$target-$(runnerTag); \
docker build . --no-cache --target $$target --platform=linux/amd64 -f ./docker/Dockerfile.runner -t localhost:5001/beta9-runner:$$target-$(runnerTag); \
docker push localhost:5001/beta9-runner:$$target-$(runnerTag); \
done

Expand Down
18 changes: 0 additions & 18 deletions docker/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,6 @@ add-apt-repository ppa:deadsnakes/ppa
apt-get update
EOT

# XXX: Remove once cedana starts shipping with a compatible binary
RUN <<EOT
set -eux
apt-get install -y python3-protobuf libnet1 libnftables1 libnl-3-200 libprotobuf-c1 iptables
curl -L -o criu_3.19-4_amd64.deb https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_22.04/amd64/criu_3.19-4_amd64.deb
dpkg -i criu_3.19-4_amd64.deb
rm criu_3.19-4_amd64.deb
EOT

ARG CEDANA_VERSION=0.9.220
RUN <<EOT
set -eux
apt-get install -y libgpgme-dev
curl -L -o cedana_amd64.deb https://github.com/cedana/cedana/releases/download/v${CEDANA_VERSION}/cedana_${CEDANA_VERSION}_amd64.deb
dpkg -i cedana_amd64.deb
rm cedana_amd64.deb
EOT

# Python 3.12
# ========================
FROM base as py312
Expand Down
25 changes: 21 additions & 4 deletions docker/Dockerfile.worker
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ RUN go build -o /usr/local/bin/worker ./cmd/worker/main.go

# final image
# ========================
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS release
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 AS release
FROM release AS dev

FROM ${BASE_STAGE} AS final
Expand All @@ -81,8 +81,6 @@ WORKDIR /workspace

RUN apt-get update && \
apt-get install -y curl gpg && \
curl -fsSL https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_22.04/Release.key | gpg --dearmor -o /usr/share/keyrings/criu.gpg && \
echo 'deb [signed-by=/usr/share/keyrings/criu.gpg] https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_22.04 /' > /etc/apt/sources.list.d/criu.list && \
curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - && \
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu22.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list \
curl -fsSL https://nvidia.github.io/nvidia-container-runtime/ubuntu22.04/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list && \
Expand All @@ -91,10 +89,29 @@ RUN apt-get update && \

RUN curl -L https://beam-runner-python-deps.s3.amazonaws.com/juicefs -o /usr/local/bin/juicefs && chmod +x /usr/local/bin/juicefs
RUN curl -fsSL https://tailscale.com/install.sh | sh
RUN apt-get install -y --no-install-recommends criu nvidia-container-toolkit-base nvidia-container-toolkit
RUN apt-get install -y --no-install-recommends nvidia-container-toolkit-base nvidia-container-toolkit

RUN apt-get update && apt-get install -y fuse3 libfuse2 libfuse3-dev libfuse-dev bash-completion

# XXX: Remove once cedana starts shipping with a compatible binary
# Installs the listed apt packages, then downloads the CRIU 3.19-4 .deb from
# the openSUSE devel:tools:criu repository, installs it with dpkg and deletes
# the downloaded file.
# - apt-get update runs in this same layer so the install never depends on
#   package lists cached by an earlier (possibly stale) layer.
# - curl -f makes an HTTP error abort the build here instead of saving the
#   error page as the .deb and failing later inside dpkg.
RUN <<EOT
set -eux
apt-get update
apt-get install -y python3-protobuf libnet1 libnftables1 libnl-3-200 libprotobuf-c1 iptables
curl -fL -o criu_3.19-4_amd64.deb https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_22.04/amd64/criu_3.19-4_amd64.deb
dpkg -i criu_3.19-4_amd64.deb
rm criu_3.19-4_amd64.deb
EOT

# Version of the cedana release .deb to install. `make worker` overrides this
# with --build-arg CEDANA_VERSION=$(cedanaTag); the default is kept in sync
# with cedanaTag in the Makefile and the github.com/cedana/cedana version in
# go.mod so that a plain `docker build` installs the same release.
ARG CEDANA_VERSION=0.9.227
# Installs libgpgme-dev, then downloads the cedana .deb for CEDANA_VERSION
# from the GitHub release page, installs it with dpkg and deletes the file.
# - apt-get update runs in this same layer so the install never depends on
#   package lists cached by an earlier (possibly stale) layer.
# - curl -f makes a missing release (HTTP 404) abort the build here instead
#   of saving the error page as the .deb and failing later inside dpkg.
RUN <<EOT
set -eux
apt-get update
apt-get install -y libgpgme-dev
curl -fL -o cedana_amd64.deb https://github.com/cedana/cedana/releases/download/v${CEDANA_VERSION}/cedana_${CEDANA_VERSION}_amd64.deb
dpkg -i cedana_amd64.deb
rm cedana_amd64.deb
EOT


ARG TARGETARCH

ENV MOUNT_S3_URL_ARM64="https://s3.amazonaws.com/mountpoint-s3-release/1.8.0/arm64/mount-s3-1.8.0-arm64.tar.gz"
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ require (
github.com/beam-cloud/clip v0.0.0-20240826223025-899feb184e88
github.com/beam-cloud/go-runc v0.0.0-20231222221338-b89899f33170
github.com/bsm/redislock v0.9.4
github.com/cedana/cedana v0.9.222
github.com/cedana/cedana v0.9.227
github.com/cenkalti/backoff v2.2.1+incompatible
github.com/cloudevents/sdk-go/v2 v2.15.1
github.com/coreos/go-iptables v0.7.1-0.20240112124308-65c67c9f46e6
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/bsm/redislock v0.9.4 h1:X/Wse1DPpiQgHbVYRE9zv6m070UcKoOGekgvpNhiSvw=
github.com/bsm/redislock v0.9.4/go.mod h1:Epf7AJLiSFwLCiZcfi6pWFO/8eAYrYpQXFxEDPoDeAk=
github.com/cedana/cedana v0.9.222 h1:ZIUuHWP3hRxrkPE/p108SJ079kdLNbBXx+CMV3+ywsU=
github.com/cedana/cedana v0.9.222/go.mod h1:IQDFr9/H9Opl2ym3xQoFoqLCO/KzebVMWkO7Mta3egk=
github.com/cedana/cedana v0.9.227 h1:HAewwTJGkUzc4bVPs31zVdHAVxulQ7SVzEjeoNE9RfM=
github.com/cedana/cedana v0.9.227/go.mod h1:fxb69FbpPSsN+Xa+mEYvOCy1rDAOv+v38meBF0SDGRw=
github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4=
github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
Expand Down
34 changes: 17 additions & 17 deletions pkg/common/config.default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,19 +101,19 @@ worker:
minFreeMemory: 32Gi
sharedMemoryLimitPct: 100%
# example gpu worker pool
# nvidia:
# mode: local
# gpuType: any
# runtime: nvidia
# jobSpec:
# nodeSelector: {}
# poolSizing:
# defaultWorkerCpu: 1000m
# defaultWorkerGpuType: ""
# defaultWorkerMemory: 1Gi
# minFreeCpu:
# minFreeGpu:
# minFreeMemory:
nvidia:
mode: local
gpuType: any
runtime: nvidia
jobSpec:
nodeSelector: {}
poolSizing:
defaultWorkerCpu: 1000m
defaultWorkerGpuType: ""
defaultWorkerMemory: 1Gi
minFreeCpu:
minFreeGpu:
minFreeMemory:
# global pool attributes
useHostResolvConf: true
hostNetwork: false
Expand Down Expand Up @@ -182,14 +182,14 @@ monitoring:
serverUrl: ""
apiKey: ""
checkpointing:
enabled: true
cedana:
client:
leaveRunning: true
sharedStorage:
dumpStorageDir: /data
connection: # needed to download GPU binaries
cedanaUrl: auth.cedana.com
cedanaUser: user
cedanaAuthToken: token
connection:
cedanaUrl:
cedanaAuthToken:
cli:
waitForReady: true
3 changes: 2 additions & 1 deletion pkg/types/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -379,5 +379,6 @@ type FluentBitEventConfig struct {
}

// CheckpointingConfig groups the settings read from the "checkpointing"
// section of the configuration: an "enabled" flag and the nested "cedana"
// client configuration.
type CheckpointingConfig struct {
Cedana cedana.Config `key:"cedana" json:"cedana"`
// Enabled is loaded from the "enabled" key.
// NOTE(review): presumably gates whether checkpointing is used at all — confirm against the consumer.
Enabled bool `key:"enabled" json:"enabled"`
Cedana cedana.Config `key:"cedana" json:"cedana"`
}

0 comments on commit 5ae4f87

Please sign in to comment.