From 00dbfa7e34356d3436250f05f2a4ba12077303be Mon Sep 17 00:00:00 2001
From: nailixing <xnlainus809>
Date: Wed, 20 May 2020 14:35:43 +0800
Subject: [PATCH] Add gpu plugin configurations

---
 scripts/kubernetes/nvidia-device-plugin.yml | 53 +++++++++++++++++++++
 scripts/kubernetes/start.sh                 |  2 +
 2 files changed, 55 insertions(+)
 create mode 100644 scripts/kubernetes/nvidia-device-plugin.yml
diff --git a/scripts/kubernetes/nvidia-device-plugin.yml b/scripts/kubernetes/nvidia-device-plugin.yml
new file mode 100644
index 00000000..2bdb4e92
--- /dev/null
+++ b/scripts/kubernetes/nvidia-device-plugin.yml
@@ -0,0 +1,53 @@
+# NVIDIA device plugin DaemonSet, deployed into the kube-system namespace.
+# The plugin container shares the kubelet device-plugin directory with the
+# host through a hostPath volume.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset-1.12
+  namespace: kube-system
+spec:
+  # apps/v1 makes the selector mandatory; it must match the pod template
+  # labels below.
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure. This annotation works in tandem with
+      # the CriticalAddonsOnly toleration below.
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      tolerations:
+        # Allow this pod to be rescheduled while the node is in "critical
+        # add-ons only" mode. Together with the annotation above, this marks
+        # the pod as a critical add-on.
+        - key: CriticalAddonsOnly
+          operator: Exists
+        # Tolerate the nvidia.com/gpu NoSchedule taint so the plugin can
+        # still be scheduled onto nodes carrying it.
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      containers:
+        - image: nvidia/k8s-device-plugin:1.11
+          name: nvidia-device-plugin-ctr
+          securityContext:
+            # No privilege escalation, and every Linux capability dropped.
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/scripts/kubernetes/start.sh b/scripts/kubernetes/start.sh
index 221528e4..6949a8c9 100644
--- a/scripts/kubernetes/start.sh
+++ b/scripts/kubernetes/start.sh
@@ -67,7 +67,9 @@ echo "Deploy ingress-nginx"
       # customer yaml: add replica to 3, fix the port to 3005
       kubectl apply -f scripts/kubernetes/ingress_controller_deploy.yaml || exit 1
     fi
+echo "Deploy GPU plugin"
 
+kubectl apply -f ./scripts/kubernetes/nvidia-device-plugin.yml || exit 1  # apply (not create) so re-running start.sh does not fail on an existing DaemonSet
 
 echo "To use Rafiki, use Rafiki Client in the Python CLI"
 echo "A quickstart is available at https://nginyc.github.io/rafiki/docs/latest/docs/src/user/quickstart.html"