Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GPU support with k3d #59

Merged
merged 8 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
!go.mod
!go.sum
!sdk
!docker
**/__pycache__
*.pyc
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,8 @@ __pycache__
venv
.python-version
.DS_Store
.terraform*
terraform.tfstate*
terraform.tfvars
sdk/src/build
sdk/src/Beam.egg-info
sdk/src/Beam.egg-info
7 changes: 3 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,18 @@ workerTag := latest
runnerTag := latest

setup:
bash bin/setup.sh
make k3d-up beam-runner beam-worker beam
kubectl delete pod -l app=beam

setup-sdk:
poetry install -C sdk

k3d-up:
k3d cluster create --config hack/k3d.yaml
kubectl config set contexts.k3d-beam.namespace beam
okteto context use k3d-beam --namespace beam
bash bin/k3d.sh up

k3d-down:
k3d cluster delete --config hack/k3d.yaml
bash bin/k3d.sh down

beam:
docker build . --target build -f ./docker/Dockerfile.beam -t localhost:5001/beam:$(tag)
Expand Down
44 changes: 44 additions & 0 deletions bin/k3d.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
#
# Create or delete the local k3d cluster described by hack/k3d.yaml.
# Usage: bin/k3d.sh up|down

# -e: abort on the first failing command; -u: treat unset variables as errors.
set -eu

# Report whether nvidia-smi can list at least one GPU on this host.
# Outputs: a notice on stdout when the nvidia-smi command is not available.
# Returns: 1 when nvidia-smi is missing; otherwise the status of grepping the
#          `nvidia-smi -L` listing for "GPU" (0 when a match is found).
check_gpu_linux() {
  # Guard clause: without nvidia-smi there is nothing to probe.
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "nvidia-smi command not found."
    return 1
  fi

  # The pipeline is the last command, so its status is the function's status.
  nvidia-smi -L | grep -q "GPU"
}

# Create the k3d cluster from hack/k3d.yaml and select it in kubectl and okteto.
#
# On Linux the k3s image from docker/Dockerfile.k3d is built first; when
# check_gpu_linux succeeds the cluster is also created with GPU passthrough
# and that image. On macOS the cluster is created with the config alone.
#
# Outputs: "Unsupported OS: <name>" on stderr for any other platform.
# Returns: exits the script with status 1 on an unsupported platform.
k3d_up() {
  local -r k3s_image="localhost:5001/rancher/k3s:latest"
  local os_type
  # An array keeps each extra flag a separate word without relying on
  # unquoted word splitting.
  local -a extra_args=()

  os_type="$(uname)"
  case "$os_type" in
    Linux*)
      if check_gpu_linux; then
        extra_args=(--gpus=all "--image=${k3s_image}")
      fi
      docker build . -f ./docker/Dockerfile.k3d -t "$k3s_image"
      ;;
    Darwin*)
      # No GPU passthrough on macOS: create the cluster from the config only.
      ;;
    *)
      echo "Unsupported OS: $os_type" >&2
      exit 1
      ;;
  esac

  # The ${arr[@]+...} form expands to nothing for an empty array and stays
  # safe under `set -u` on bash releases older than 4.4 (e.g. macOS bash 3.2).
  k3d cluster create --config hack/k3d.yaml ${extra_args[@]+"${extra_args[@]}"}
  kubectl config set contexts.k3d-beam.namespace beam
  okteto context use k3d-beam --namespace beam
}

# Delete the k3d cluster defined by hack/k3d.yaml.
k3d_down() {
  local -r cluster_config="hack/k3d.yaml"
  k3d cluster delete --config "$cluster_config"
}

# Entry point: dispatch on the first command-line argument.
# "${1:-}" is used instead of "$1" so that running the script without any
# argument reaches the usage message below rather than aborting with a bare
# "unbound variable" error under `set -u`.
case "${1:-}" in
  up) k3d_up ;;
  down) k3d_down ;;
  *)
    echo "Unsupported command: ${1:-}" >&2
    echo "Usage: $0 up|down" >&2
    exit 1
    ;;
esac
28 changes: 28 additions & 0 deletions bin/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
#
# Install the command-line tools used for local development:
# kubectl, stern, okteto and k3d.
# kubectl and stern are written to /usr/local/bin, so the caller needs write
# access to that directory.

# Fail fast: abort on errors, on unset variables and on a failure in any
# pipeline stage, so a broken download cannot be piped into tar or a shell
# unnoticed.
set -euo pipefail

os=$(uname -s | tr '[:upper:]' '[:lower:]')

# Map `uname -m` onto the architecture names used in the release URLs.
# Linux reports 64-bit ARM as "aarch64", macOS as "arm64".
machine=$(uname -m)
case "$machine" in
  arm64 | aarch64) arch="arm64" ;;
  x86_64 | amd64) arch="amd64" ;;
  *)
    echo "Unsupported architecture: $machine" >&2
    exit 1
    ;;
esac

k8s_version=$(curl -sSfL https://dl.k8s.io/release/stable.txt)
stern_version="1.28.0"

echo "Installing kubectl"
# Download to a temporary file first so a failed transfer never truncates an
# already-installed kubectl.
kubectl_tmp=$(mktemp)
trap 'rm -f -- "$kubectl_tmp"' EXIT
curl -sSfL "https://dl.k8s.io/release/${k8s_version}/bin/${os}/${arch}/kubectl" -o "$kubectl_tmp"
install -m 0755 "$kubectl_tmp" /usr/local/bin/kubectl

echo "Installing stern"
curl -sSfL "https://github.com/stern/stern/releases/download/v${stern_version}/stern_${stern_version}_${os}_${arch}.tar.gz" \
  | tar -xz -C /usr/local/bin stern
chmod +x /usr/local/bin/stern

echo "Installing okteto"
curl -sSfL https://get.okteto.com | sh

echo "Installing k3d"
curl -sSfL https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
24 changes: 24 additions & 0 deletions deploy/crusoe/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Beam Arc on Crusoe Cloud

This directory contains a Terraform configuration that provisions a single instance of Beam Arc on [Crusoe Cloud](https://docs.crusoecloud.com/).

## Prereqs

1. Install Terraform.
1. Configure the Crusoe CLI ([docs](https://docs.crusoecloud.com/quickstart/installing-the-cli/index.html#configure-the-cli)).

## Getting started

1. Find your project ID.
1. Make a new SSH key, or point to an existing one.
1. Create a `terraform.tfvars` file in this directory.
```
project_id = "<uuid of your project>"
ssh_key_path = "~/.ssh/id_crusoecloud.pub"
```
1. Initialize and apply the Terraform configuration.

```sh
terraform init
terraform apply
```
8 changes: 8 additions & 0 deletions deploy/crusoe/config.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Declare the providers this configuration needs.
terraform {
required_providers {
# Crusoe Cloud provider, pinned to one exact release.
crusoe = {
source = "registry.terraform.io/crusoecloud/crusoe"
version = "0.5.3"
}
}
}
58 changes: 58 additions & 0 deletions deploy/crusoe/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Values shared by the resources in this file.
locals {
# Used as the name of the instance, the data disk and the firewall rule.
name = "beam-arc"
# Contents of the file at var.ssh_key_path; passed to the instance as ssh_key.
ssh_key_content = file(var.ssh_key_path)
}

# Compute instance for Beam Arc, with the data disk attached read-write.
resource "crusoe_compute_instance" "this" {
  name       = local.name
  image      = "ubuntu20.04-nvidia-pcie-docker:latest"
  type       = var.instance_type
  ssh_key    = local.ssh_key_content
  location   = var.location
  project_id = var.project_id

  # Prepare the attached data disk and mount it at /data.
  # The disk is a separate resource and may already hold data when the
  # instance is recreated, so the script only formats a disk that has no
  # filesystem yet and tolerates an existing directory or mount.
  # NOTE(review): assumes the attached data disk shows up as /dev/vda - confirm.
  startup_script = <<-EOF
    #!/bin/bash
    mkdir -p /data
    if ! blkid /dev/vda >/dev/null 2>&1; then
      mkfs.ext4 /dev/vda
    fi
    mountpoint -q /data || mount -t ext4 /dev/vda /data

    # cd /data
    # git clone https://github.com/beam-cloud/beam.git
    # cd beam
    # make setup
  EOF

  disks = [
    {
      id              = crusoe_storage_disk.data.id
      mode            = "read-write"
      attachment_type = "data"
    }
  ]

  depends_on = [crusoe_storage_disk.data]
}

# Data disk that is attached to the compute instance in read-write mode.
# It is created in the same location and project as the instance.
resource "crusoe_storage_disk" "data" {
name = local.name
size = "400GiB"
location = var.location
project_id = var.project_id
}

# Look up the existing VPC networks; the ingress firewall rule uses the first entry.
data "crusoe_vpc_networks" "this" {}

# Allow inbound TCP from any address to ports 1993-1994 inside the first VPC
# network returned by the lookup.
# NOTE(review): 1993-1994 are presumed to be the Beam gateway ports - confirm.
resource "crusoe_vpc_firewall_rule" "ingress" {
  network   = data.crusoe_vpc_networks.this.vpc_networks[0].id
  name      = local.name
  action    = "allow"
  direction = "ingress"
  protocols = "tcp"
  source    = "0.0.0.0/0"
  # Clients connect from arbitrary ephemeral ports, so the source side is left
  # open and the restriction is applied to the destination ports. Restricting
  # the source ports instead would expose every destination port to any peer
  # that picks source port 1993 or 1994, while blocking ordinary clients.
  source_ports      = "1-65535"
  destination       = data.crusoe_vpc_networks.this.vpc_networks[0].cidr
  destination_ports = "1993-1994"

  depends_on = [data.crusoe_vpc_networks.this]
}
3 changes: 3 additions & 0 deletions deploy/crusoe/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public IPv4 address of the instance's first network interface.
output "public_ipv4" {
value = crusoe_compute_instance.this.network_interfaces[0].public_ipv4.address
}
20 changes: 20 additions & 0 deletions deploy/crusoe/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Required (no default). Applied to both the compute instance and the data disk.
variable "project_id" {
description = "UUID of your project."
type = string
}

# Required (no default). The file at this path is read with file() and its
# contents are passed to the instance as ssh_key.
variable "ssh_key_path" {
description = "Path to your public SSH key."
type = string
}

# Shared by the compute instance and the data disk.
variable "location" {
description = "Location to deploy your resources."
type = string
default = "us-northcentral1-a"
}

# Passed to the compute instance as its `type`.
variable "instance_type" {
  description = "Crusoe instance type to use for the Beam Arc instance."
  type        = string
  default     = "a40.1x"
}
36 changes: 36 additions & 0 deletions docker/Dockerfile.k3d
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# syntax=docker/dockerfile:1.6
# k3s node image with the NVIDIA container runtime installed.

# Stage used only as the source of the /bin directory copied further down.
FROM rancher/k3s:v1.28.5-k3s1 as k3s
# The final image is built on the CUDA base image for Ubuntu 20.04.
FROM nvidia/cuda:12.3.1-base-ubuntu20.04

# Location of the crictl configuration inside the k3s agent directory.
ENV CRI_CONFIG_FILE=/var/lib/rancher/k3s/agent/etc/crictl.yaml
# Make the helper binaries under /bin/aux reachable through PATH.
ENV PATH="$PATH:/bin/aux"

# Add the nvidia-container-runtime apt repository, install the NVIDIA
# container toolkit/runtime packages and util-linux, create the nsswitch
# config and the containerd config directory, then remove apt caches and logs.
RUN <<EOT
set -eu
echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
apt-get update
apt-get -y install gnupg2 curl
curl -sL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
curl -sL https://nvidia.github.io/nvidia-container-runtime/ubuntu20.04/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list
apt-get update
apt-get -y install nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux

mkdir -vp /etc && echo 'hosts: files dns' > /etc/nsswitch.conf
chmod 1777 /tmp
mkdir -vp /var/lib/rancher/k3s/agent/etc/containerd/

apt-get clean
apt-get autoremove -y
apt-get autopurge -y
rm -rf /var/lib/apt/lists/* /var/log/*
EOT

# Copy /bin from the k3s stage into this image's /bin.
COPY --from=k3s /bin /bin

# Directories declared as volumes.
VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# Runs `k3s agent` by default; the CMD argument can be overridden at run time.
ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
70 changes: 36 additions & 34 deletions docker/Dockerfile.worker
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@ ARG BASE_STAGE=dev

FROM golang:1.21-bullseye AS golang

RUN <<EOT
set -eux
apt-get update
apt-get install -y --no-install-recommends curl git
EOT
RUN apt-get update && apt-get install -y curl git


# Skopeo
# skopeo
# ========================
FROM golang AS skopeo

Expand All @@ -30,13 +26,13 @@ make install
EOT


# RUNC
# runc
# ========================
FROM golang AS runc

WORKDIR /workspace

RUN apt-get install -y --no-install-recommends libseccomp-dev
RUN apt-get install -y libseccomp-dev

RUN <<EOT
set -eux
Expand All @@ -47,7 +43,22 @@ make install
EOT


# Beam Worker
# nvidia-container-toolkit
# ========================
FROM golang AS nvidia-container-toolkit

WORKDIR /workspace

RUN apt update && apt install -y build-essential

RUN <<EOT
git clone -b np/update --single-branch https://github.com/beam-cloud/nvidia-container-toolkit.git .
make build
make binaries
EOT


# beam worker
# ========================
FROM golang AS worker

Expand All @@ -60,45 +71,36 @@ COPY . .
RUN go build -o /usr/local/bin/worker ./cmd/worker/main.go


# NVIDIA CUDA - Final Stage
# final image
# ========================
FROM nvidia/cuda:12.3.1-base-ubuntu20.04 AS release
FROM release AS dev

FROM ${BASE_STAGE} AS final

ENV DEBIAN_FRONTEND="noninteractive"
WORKDIR /workspace

RUN apt-get update && \
apt-get install -y --no-install-recommends --no-install-recommends curl gpg fuse3
apt-get install -y curl gpg fuse3 && \
curl -fsSL https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04/Release.key | gpg --dearmor -o /usr/share/keyrings/criu.gpg && \
echo 'deb [signed-by=/usr/share/keyrings/criu.gpg] https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04 /' > /etc/apt/sources.list.d/criu.list && \
curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - && \
curl -fsSL https://nvidia.github.io/nvidia-container-runtime/ubuntu20.04/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list && \
apt-get update

RUN <<EOT
set -eux
RUN curl -sSL https://d.juicefs.com/install | sh -
RUN apt-get install -y --no-install-recommends criu nvidia-container-toolkit-base nvidia-container-toolkit

# JuiceFS
curl -sSL https://d.juicefs.com/install | sh -

# nvidia-container-toolkit repo
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
echo 'deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/$(ARCH) /' > /etc/apt/sources.list.d/nvidia-container-toolkit.list

# criu repo
curl -fsSL https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04/Release.key | gpg --dearmor -o /usr/share/keyrings/criu.gpg
echo 'deb [signed-by=/usr/share/keyrings/criu.gpg] https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04 /' > /etc/apt/sources.list.d/criu.list

apt-get update
apt-get install -y --no-install-recommends nvidia-container-runtime criu
apt-get remove -y curl gpg
apt-get clean
apt-get autoremove -y
apt-get autopurge -y
rm -rf /var/lib/apt/lists/* /var/log/*
EOT
RUN apt-get remove -y curl gpg && \
apt-get clean && apt-get autoremove -y && apt-get autopurge -y && \
rm -rf /var/lib/apt/lists/* /var/log/*

COPY --from=runc /usr/local/sbin/runc /usr/local/sbin/runc
COPY --from=skopeo /usr/local/bin/skopeo /usr/local/bin/skopeo
COPY --from=skopeo /workspace/default-policy.json /etc/containers/policy.json
COPY --from=nvidia-container-toolkit /workspace/nvidia-container-runtime* /usr/bin/
COPY --from=worker /usr/local/bin/worker /usr/local/bin/worker
COPY ./sdk/src/beam /workspace/sdk

VOLUME ["/usr/lib/x86_64-linux-gnu", "/usr/lib/aarch64-linux-gnu"]
VOLUME "/usr/lib/x86_64-linux-gnu"
VOLUME "/usr/lib/aarch64-linux-gnu"
Loading