Skip to content

Commit

Permalink
GPU support without restart
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidGOrtega committed Jan 20, 2021
1 parent e14146d commit f252af3
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 17 deletions.
6 changes: 0 additions & 6 deletions cml/ami.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,6 @@
"type": "shell",
"environment_vars": ["DEBIAN_FRONTEND=noninteractive"],
"script": "./setup.sh"
},
{
"type": "shell",
"inline": ["sudo shutdown -r now", "sleep 60"],
"start_retry_timeout": "10m",
"expect_disconnect": true
}
]
}
13 changes: 10 additions & 3 deletions cml/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assume
sudo apt remove unattended-upgrades
systemctl disable apt-daily-upgrade.service

sudo add-apt-repository universe -y
sudo add-apt-repository ppa:git-core/ppa -y
sudo apt update && sudo apt-get install -y git
sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh && \
Expand All @@ -21,6 +22,12 @@ sudo apt update && sudo apt-get install -y nodejs

sudo apt install -y ubuntu-drivers-common git
sudo ubuntu-drivers autoinstall
curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add - && \
curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-container-toolkit

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2

sudo systemctl restart docker

sudo nvidia-smi
sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
27 changes: 19 additions & 8 deletions iterative/resource_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,39 +228,50 @@ func provisionerCode(d *schema.ResourceData) (string, error) {
data["idle_timeout"] = strconv.Itoa(d.Get("idle_timeout").(int))
data["name"] = d.Get("name").(string)
data["tf_resource"] = base64.StdEncoding.EncodeToString(jsonResource)
data["instance_gpu"] = d.Get("instance_gpu").(string)
data["AWS_SECRET_ACCESS_KEY"] = os.Getenv("AWS_SECRET_ACCESS_KEY")
data["AWS_ACCESS_KEY_ID"] = os.Getenv("AWS_ACCESS_KEY_ID")
data["AZURE_CLIENT_ID"] = os.Getenv("AZURE_CLIENT_ID")
data["AZURE_CLIENT_SECRET"] = os.Getenv("AZURE_CLIENT_SECRET")
data["AZURE_SUBSCRIPTION_ID"] = os.Getenv("AZURE_SUBSCRIPTION_ID")
data["AZURE_TENANT_ID"] = os.Getenv("AZURE_TENANT_ID")

tmpl, err := template.New("deploy").Parse(`#!/bin/bash
tmpl, err := template.New("deploy").Parse(`#!/bin/sh
export DEBIAN_FRONTEND=noninteractive
{{if eq .cloud "azure"}}
echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assumeyes
sudo apt remove unattended-upgrades
systemctl disable apt-daily-upgrade.service
systemctl disable apt-daily-upgrade.service
sudo add-apt-repository universe -y
sudo add-apt-repository ppa:git-core/ppa -y
sudo apt update && sudo apt-get install -y git
sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh
sudo usermod -aG docker ubuntu
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update && sudo apt-get install -y terraform
curl -sL https://deb.nodesource.com/setup_12.x | sudo bash
sudo apt update && sudo apt-get install -y nodejs
sudo apt install -y ubuntu-drivers-common git
sudo ubuntu-drivers autoinstall
sudo rmmod nvidia && sudo nvidia-smi
curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-container-toolkit
sudo ubuntu-drivers autoinstall
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
sudo nvidia-smi
sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
{{end}}
sudo npm install -g git+https://github.com/iterative/cml.git#cml-runner
export HOME=/root
export AWS_SECRET_ACCESS_KEY={{.AWS_SECRET_ACCESS_KEY}}
Expand All @@ -269,7 +280,7 @@ export AZURE_CLIENT_ID={{.AZURE_CLIENT_ID}}
export AZURE_CLIENT_SECRET={{.AZURE_CLIENT_SECRET}}
export AZURE_SUBSCRIPTION_ID={{.AZURE_SUBSCRIPTION_ID}}
export AZURE_TENANT_ID={{.AZURE_TENANT_ID}}
nohup cml-runner{{if .name}} --name {{.name}}{{end}}{{if .labels}} --labels {{.labels}}{{end}}{{if .idle_timeout}} --idle-timeout {{.idle_timeout}}{{end}}{{if .driver}} --driver {{.driver}}{{end}}{{if .repo}} --repo {{.repo}}{{end}}{{if .token}} --token {{.token}}{{end}}{{if .tf_resource}} --tf_resource={{.tf_resource}}{{end}} < /dev/null > std.out 2> std.err &
nohup cml-runner{{if .name}} --name {{.name}}{{end}}{{if .labels}} --labels {{.labels}}{{end}}{{if .idle_timeout}} --idle-timeout {{.idle_timeout}}{{end}}{{if .driver}} --driver {{.driver}}{{end}}{{if .repo}} --repo {{.repo}}{{end}}{{if .token}} --token {{.token}}{{end}}{{if .tf_resource}} --tf_resource={{.tf_resource}}{{end}} {{if .instance_gpu}} --cloud-gpu {{.instance_gpu}}{{end}} < /dev/null > std.out 2> std.err &
sleep 10
`)
var customDataBuffer bytes.Buffer
Expand Down

0 comments on commit f252af3

Please sign in to comment.