feat: Neuron scheduler update for trainium-inferentia blueprints (#624)
ratnopamc authored Aug 30, 2024
1 parent 6a92682 · commit ff14530
Showing 6 changed files with 112 additions and 102 deletions.
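In short, the change threads `schedulerName: my-scheduler` into the head and worker Pod templates of each trainium/inferentia blueprint, so that Pods requesting `aws.amazon.com/neuron` devices are bound by the Neuron scheduler extension rather than the default kube-scheduler. Below is a minimal sketch of the recurring pattern (not copied verbatim from any single file in this diff), assuming the Neuron scheduler extension is already installed in the cluster and registered under the name `my-scheduler`. Note that `schedulerName` is a PodSpec field, so it belongs under `template.spec`:

```yaml
# Sketch only. Assumes the AWS Neuron scheduler extension is deployed and
# registered in the cluster under the scheduler name "my-scheduler".
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: example-neuron-service   # hypothetical name, for illustration
spec:
  rayClusterConfig:
    workerGroupSpecs:
      - groupName: inf2
        template:
          spec:
            schedulerName: my-scheduler   # PodSpec field: hands binding over to the Neuron scheduler
            containers:
              - name: worker
                resources:
                  limits:
                    aws.amazon.com/neuron: "12"   # devices advertised by the Neuron device plugin
```

Pods that omit `schedulerName` continue to be placed by `default-scheduler`, which treats the Neuron resource as an opaque count; awareness of Neuron device topology is the gap the custom scheduler is meant to close.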
176 changes: 89 additions & 87 deletions gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
@@ -49,83 +49,85 @@ spec:
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest # Image created using the Dockerfile attached in the folder
              imagePullPolicy: Always # Ensure the image is always pulled when updated
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 6379
                  name: gcs
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: 4
                  memory: 20Gi
                requests:
                  cpu: 4
                  memory: 20Gi
              env:
                - name: LD_LIBRARY_PATH
                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
          nodeSelector: # This is using Karpenter Nodes with the provisioner label
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
            - name: ray-logs
              emptyDir: {}
    workerGroupSpecs:
      - groupName: inf2
        replicas: 1
        minReplicas: 1
        maxReplicas: 1
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest
                imagePullPolicy: Always # Ensure the image is always pulled when updated
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
                    cpu: "180"
                    memory: "700G"
                    aws.amazon.com/neuron: "12"
                  requests:
                    cpu: "180"
                    memory: "700G"
                    aws.amazon.com/neuron: "12"
                env:
                  - name: LD_LIBRARY_PATH
                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
            tolerations:
              - key: "aws.amazon.com/neuron"
                operator: "Exists"
                effect: "NoSchedule"
              - key: "hub.jupyter.org/dedicated"
                operator: "Equal"
                value: "user"
                effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
@@ -137,21 +139,21 @@ metadata:
spec:
  ingressClassName: nginx
  rules:
    - http:
        paths:
          # Ray Dashboard
          - path: /dashboard/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: llama2
                port:
                  number: 8265
          # Ray Serve
          - path: /serve/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: llama2
                port:
                  number: 8000
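Independently of Ray, the scheduler wiring can be sanity-checked with a throwaway Pod. Everything in the sketch below is illustrative (the Pod name and image are hypothetical); only the scheduler name, the resource name, and the toleration mirror the manifests above:

```yaml
# Hypothetical smoke test: if "my-scheduler" is running and registered, this Pod
# gets bound by it; if not, the Pod stays Pending with no scheduling events.
apiVersion: v1
kind: Pod
metadata:
  name: neuron-sched-smoke-test   # illustrative name
spec:
  schedulerName: my-scheduler
  tolerations:
    - key: "aws.amazon.com/neuron"   # same taint the worker groups tolerate
      operator: "Exists"
      effect: "NoSchedule"
  containers:
    - name: app
      image: public.ecr.aws/docker/library/busybox:latest   # any small image works
      command: ["sleep", "3600"]
      resources:
        limits:
          aws.amazon.com/neuron: "1"   # request a single Neuron device
```

If the Pod stays Pending with no events attributed to `my-scheduler`, the scheduler extension is likely not running or is registered under a different name.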
@@ -37,6 +37,7 @@ spec:
        dashboard-host: '0.0.0.0'
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama3:latest # Image created using the Dockerfile attached in the folder
@@ -65,13 +66,13 @@ spec:
                  cpu: 4
                  memory: 20Gi
              env:
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: hf-token
                - name: LD_LIBRARY_PATH
                  value: "/home/ray/anaconda3/lib"
          nodeSelector: # This is using Karpenter Nodes with the provisioner label
            instanceType: mixed-x86
            provisionerType: Karpenter
@@ -87,6 +88,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama3:latest
@@ -105,13 +107,13 @@ spec:
                    memory: "700G"
                    aws.amazon.com/neuron: "12"
                env:
                  - name: LD_LIBRARY_PATH
                    value: /home/ray/anaconda3/lib
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token
                        key: hf-token
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
@@ -66,6 +66,7 @@ spec:
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -118,6 +119,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -64,6 +64,7 @@ spec:
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -114,6 +115,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -50,6 +50,7 @@ spec:
        dashboard-host: '0.0.0.0'
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:latest
@@ -91,6 +92,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:latest
2 changes: 1 addition & 1 deletion website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -117,7 +117,7 @@ To deploy the Mistral-7B-Instruct-v0.2 model, it's essential to configure your H

export HUGGING_FACE_HUB_TOKEN=$(echo -n "Your-Hugging-Face-Hub-Token-Value" | base64)

-cd ./../gen-ai/inference/mistral-7b-rayserve-inf2
+cd ../../gen-ai/inference/mistral-7b-rayserve-inf2
envsubst < ray-service-mistral.yaml| kubectl apply -f -
```
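For context on that last hunk: the documented deploy step pipes the manifest through `envsubst`, which substitutes the exported `HUGGING_FACE_HUB_TOKEN` value into the Secret that the containers' `secretKeyRef` entries (name `hf-token`, key `hf-token`) read. A hedged sketch of what the manifest plausibly contains before substitution:

```yaml
# Sketch; assumes ray-service-mistral.yaml templates the token into a Secret
# this way (the Secret name and key match the secretKeyRef entries above).
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
data:
  hf-token: $HUGGING_FACE_HUB_TOKEN   # replaced by envsubst; already base64-encoded by the export
```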