From 00dbfa7e34356d3436250f05f2a4ba12077303be Mon Sep 17 00:00:00 2001
From: nailixing
Date: Wed, 20 May 2020 14:35:43 +0800
Subject: [PATCH] Add gpu plugin configurations

---
Review notes (free text here, before the diffstat, is not part of the commit):
* The DaemonSet now uses apps/v1 instead of extensions/v1beta1, which is no
  longer served for DaemonSets as of Kubernetes 1.16. apps/v1 requires an
  explicit spec.selector, so one was added that matches the pod label.
* start.sh now uses "kubectl apply" (like the ingress step right above it) so
  that re-running the script does not fail with AlreadyExists.
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. The start.sh context lines may need
  "git apply --ignore-whitespace", and the blob ids on the index lines are
  those of the original commit (not recomputed).

 scripts/kubernetes/nvidia-device-plugin.yml | 45 +++++++++++++++++++++
 scripts/kubernetes/start.sh                 |  2 ++
 2 files changed, 47 insertions(+)
 create mode 100644 scripts/kubernetes/nvidia-device-plugin.yml

diff --git a/scripts/kubernetes/nvidia-device-plugin.yml b/scripts/kubernetes/nvidia-device-plugin.yml
new file mode 100644
index 00000000..2bdb4e92
--- /dev/null
+++ b/scripts/kubernetes/nvidia-device-plugin.yml
@@ -0,0 +1,45 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset-1.12
+  namespace: kube-system
+spec:
+  # apps/v1 requires an explicit selector; it must match template.metadata.labels.
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
+      # reserves resources for critical add-on pods so that they can be rescheduled after
+      # a failure. This annotation works in tandem with the toleration below.
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      tolerations:
+        # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
+        # This, along with the annotation above, marks this pod as a critical add-on.
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      containers:
+        # NOTE(review): image tag 1.11 does not match the "-1.12" suffix in metadata.name; confirm the intended version.
+        - image: nvidia/k8s-device-plugin:1.11
+          name: nvidia-device-plugin-ctr
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/scripts/kubernetes/start.sh b/scripts/kubernetes/start.sh
index 221528e4..6949a8c9 100644
--- a/scripts/kubernetes/start.sh
+++ b/scripts/kubernetes/start.sh
@@ -67,7 +67,9 @@ echo "Deploy ingress-nginx"
     # customer yaml: add replica to 3, fix the port to 3005
     kubectl apply -f scripts/kubernetes/ingress_controller_deploy.yaml || exit 1
 fi
+echo "Deploy GPU plugin"
 
+kubectl apply -f ./scripts/kubernetes/nvidia-device-plugin.yml
 
 echo "To use Rafiki, use Rafiki Client in the Python CLI"
 echo "A quickstart is available at https://nginyc.github.io/rafiki/docs/latest/docs/src/user/quickstart.html"