From f252af3466e79ea07d740f78f78215a9e92f104d Mon Sep 17 00:00:00 2001 From: davidgortega Date: Wed, 20 Jan 2021 02:14:41 +0100 Subject: [PATCH] GPU support without restart --- cml/ami.json | 6 ------ cml/setup.sh | 13 ++++++++++--- iterative/resource_runner.go | 27 +++++++++++++++++++-------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/cml/ami.json b/cml/ami.json index 31108b09..9da94302 100644 --- a/cml/ami.json +++ b/cml/ami.json @@ -33,12 +33,6 @@ "type": "shell", "environment_vars": ["DEBIAN_FRONTEND=noninteractive"], "script": "./setup.sh" - }, - { - "type": "shell", - "inline": ["sudo shutdown -r now", "sleep 60"], - "start_retry_timeout": "10m", - "expect_disconnect": true } ] } diff --git a/cml/setup.sh b/cml/setup.sh index 0fb817d5..fe05e56c 100755 --- a/cml/setup.sh +++ b/cml/setup.sh @@ -6,6 +6,7 @@ echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assume sudo apt remove unattended-upgrades systemctl disable apt-daily-upgrade.service +sudo add-apt-repository universe -y sudo add-apt-repository ppa:git-core/ppa -y sudo apt update && sudo apt-get install -y git sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh && \ @@ -21,6 +22,12 @@ sudo apt update && sudo apt-get install -y nodejs sudo apt install -y ubuntu-drivers-common git sudo ubuntu-drivers autoinstall -curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add - && \ -curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list -sudo apt update && sudo apt install -y nvidia-container-toolkit + +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list +sudo apt update && sudo apt install -y nvidia-docker2 + +sudo systemctl restart docker + +sudo nvidia-smi +sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi diff --git a/iterative/resource_runner.go b/iterative/resource_runner.go index e2276220..8413c052 100644 --- a/iterative/resource_runner.go +++ b/iterative/resource_runner.go @@ -228,6 +228,7 @@ func provisionerCode(d *schema.ResourceData) (string, error) { data["idle_timeout"] = strconv.Itoa(d.Get("idle_timeout").(int)) data["name"] = d.Get("name").(string) data["tf_resource"] = base64.StdEncoding.EncodeToString(jsonResource) + data["instance_gpu"] = d.Get("instance_gpu").(string) data["AWS_SECRET_ACCESS_KEY"] = os.Getenv("AWS_SECRET_ACCESS_KEY") data["AWS_ACCESS_KEY_ID"] = os.Getenv("AWS_ACCESS_KEY_ID") data["AZURE_CLIENT_ID"] = os.Getenv("AZURE_CLIENT_ID") @@ -235,32 +236,42 @@ func provisionerCode(d *schema.ResourceData) (string, error) { data["AZURE_SUBSCRIPTION_ID"] = os.Getenv("AZURE_SUBSCRIPTION_ID") data["AZURE_TENANT_ID"] = os.Getenv("AZURE_TENANT_ID") - tmpl, err := template.New("deploy").Parse(`#!/bin/bash + tmpl, err := template.New("deploy").Parse(`#!/bin/sh export DEBIAN_FRONTEND=noninteractive {{if eq .cloud "azure"}} echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assumeyes sudo apt remove unattended-upgrades -systemctl disable apt-daily-upgrade.service +systemctl disable apt-daily-upgrade.service +sudo add-apt-repository universe -y sudo add-apt-repository ppa:git-core/ppa -y sudo apt update && sudo apt-get install -y git sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh sudo usermod -aG docker ubuntu sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock + curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add - sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main" sudo apt update && sudo apt-get install -y terraform + curl -sL https://deb.nodesource.com/setup_12.x | sudo bash sudo apt update && sudo apt-get install -y nodejs + sudo apt install -y ubuntu-drivers-common git -sudo ubuntu-drivers autoinstall -sudo rmmod nvidia && sudo nvidia-smi -curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add - -curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list -sudo apt update && sudo apt install -y nvidia-container-toolkit +sudo ubuntu-drivers autoinstall + +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list +sudo apt update && sudo apt install -y nvidia-docker2 + +sudo systemctl restart docker + +sudo nvidia-smi +sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi {{end}} + sudo npm install -g git+https://github.com/iterative/cml.git#cml-runner export HOME=/root export AWS_SECRET_ACCESS_KEY={{.AWS_SECRET_ACCESS_KEY}} @@ -269,7 +280,7 @@ export AZURE_CLIENT_ID={{.AZURE_CLIENT_ID}} export AZURE_CLIENT_SECRET={{.AZURE_CLIENT_SECRET}} export AZURE_SUBSCRIPTION_ID={{.AZURE_SUBSCRIPTION_ID}} export AZURE_TENANT_ID={{.AZURE_TENANT_ID}} -nohup cml-runner{{if .name}} --name {{.name}}{{end}}{{if .labels}} --labels {{.labels}}{{end}}{{if .idle_timeout}} --idle-timeout {{.idle_timeout}}{{end}}{{if .driver}} --driver {{.driver}}{{end}}{{if .repo}} --repo {{.repo}}{{end}}{{if .token}} --token {{.token}}{{end}}{{if .tf_resource}} --tf_resource={{.tf_resource}}{{end}} < /dev/null > std.out 2> std.err & +nohup cml-runner{{if .name}} --name {{.name}}{{end}}{{if .labels}} --labels {{.labels}}{{end}}{{if .idle_timeout}} --idle-timeout {{.idle_timeout}}{{end}}{{if .driver}} --driver {{.driver}}{{end}}{{if .repo}} --repo {{.repo}}{{end}}{{if .token}} --token {{.token}}{{end}}{{if .tf_resource}} --tf_resource={{.tf_resource}}{{end}} {{if .instance_gpu}} --cloud-gpu {{.instance_gpu}}{{end}} < /dev/null > std.out 2> std.err & sleep 10 `) var customDataBuffer bytes.Buffer