From ff145308263203072229d4db79285919a21af42b Mon Sep 17 00:00:00 2001
From: Ratnopam Charabarti
Date: Thu, 29 Aug 2024 23:02:55 -0700
Subject: [PATCH] feat: Neuron scheduler update for trainium-inferentia
 blueprints (#624)

---
 .../ray-service-llama2.yaml                   | 176 +++++++++---------
 .../ray-service-llama3.yaml                   |  30 +--
 .../ray-service-mistral-ft.yaml               |   2 +
 .../ray-service-mistral.yaml                  |   2 +
 .../ray-service-stablediffusion.yaml          |   2 +
 .../docs/gen-ai/inference/Mistral-7b-inf2.md  |   2 +-
 6 files changed, 112 insertions(+), 102 deletions(-)

diff --git a/gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml b/gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
index c55ddb55b..40275f7ac 100644
--- a/gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
+++ b/gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
@@ -49,83 +49,85 @@ spec:
       rayStartParams:
         dashboard-host: '0.0.0.0'
       template:
         spec:
+          schedulerName: my-scheduler
           containers:
-          - name: head
-            image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest # Image created using the Dockerfile attached in the folder
-            imagePullPolicy: Always # Ensure the image is always pulled when updated
-            lifecycle:
-              preStop:
-                exec:
-                  command: ["/bin/sh", "-c", "ray stop"]
-            ports:
-            - containerPort: 6379
-              name: gcs
-            - containerPort: 8265
-              name: dashboard
-            - containerPort: 10001
-              name: client
-            - containerPort: 8000
-              name: serve
-            volumeMounts:
-            - mountPath: /tmp/ray
-              name: ray-logs
-            resources:
-              limits:
-                cpu: 4
-                memory: 20Gi
-              requests:
-                cpu: 4
-                memory: 20Gi
-            env:
-            - name: LD_LIBRARY_PATH
-              value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            - name: head
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest # Image created using the Dockerfile attached in the folder
+              imagePullPolicy: Always # Ensure the image is always pulled when updated
+              lifecycle:
+                preStop:
+                  exec:
+                    command: ["/bin/sh", "-c", "ray stop"]
+              ports:
+                - containerPort: 6379
+                  name: gcs
+                - containerPort: 8265
+                  name: dashboard
+                - containerPort: 10001
+                  name: client
+                - containerPort: 8000
+                  name: serve
+              volumeMounts:
+                - mountPath: /tmp/ray
+                  name: ray-logs
+              resources:
+                limits:
+                  cpu: 4
+                  memory: 20Gi
+                requests:
+                  cpu: 4
+                  memory: 20Gi
+              env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
           nodeSelector: # This is using Karpenter Nodes with the provisioner label
             instanceType: mixed-x86
             provisionerType: Karpenter
            workload: rayhead
           volumes:
-          - name: ray-logs
-            emptyDir: {}
+            - name: ray-logs
+              emptyDir: {}
     workerGroupSpecs:
-    - groupName: inf2
-      replicas: 1
-      minReplicas: 1
-      maxReplicas: 1
-      rayStartParams: {}
-      template:
-        spec:
-          containers:
-          - name: worker
-            image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest
-            imagePullPolicy: Always # Ensure the image is always pulled when updated
-            lifecycle:
-              preStop:
-                exec:
-                  command: ["/bin/sh", "-c", "ray stop"]
-            resources:
-              limits:
-                cpu: "180"
-                memory: "700G"
-                aws.amazon.com/neuron: "12"
-              requests:
-                cpu: "180"
-                memory: "700G"
-                aws.amazon.com/neuron: "12"
-            env:
-            - name: LD_LIBRARY_PATH
-              value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
-          nodeSelector:
-            instanceType: inferentia-inf2
-            provisionerType: Karpenter
-          tolerations:
-          - key: "aws.amazon.com/neuron"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated"
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
+      - groupName: inf2
+        replicas: 1
+        minReplicas: 1
+        maxReplicas: 1
+        rayStartParams: {}
+        template:
+          spec:
+            schedulerName: my-scheduler
+            containers:
+              - name: worker
+                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest
+                imagePullPolicy: Always # Ensure the image is always pulled when updated
+                lifecycle:
+                  preStop:
+                    exec:
+                      command: ["/bin/sh", "-c", "ray stop"]
+                resources:
+                  limits:
+                    cpu: "180"
+                    memory: "700G"
+                    aws.amazon.com/neuron: "12"
+                  requests:
+                    cpu: "180"
+                    memory: "700G"
+                    aws.amazon.com/neuron: "12"
+                env:
+                  - name: LD_LIBRARY_PATH
+                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            nodeSelector:
+              instanceType: inferentia-inf2
+              provisionerType: Karpenter
+            tolerations:
+              - key: "aws.amazon.com/neuron"
+                operator: "Exists"
+                effect: "NoSchedule"
+              - key: "hub.jupyter.org/dedicated"
+                operator: "Equal"
+                value: "user"
+                effect: "NoSchedule"
 ---
 apiVersion: networking.k8s.io/v1
 kind: Ingress
@@ -137,21 +139,21 @@ metadata:
 spec:
   ingressClassName: nginx
   rules:
-  - http:
-      paths:
-      # Ray Dashboard
-      - path: /dashboard/(.*)
-        pathType: ImplementationSpecific
-        backend:
-          service:
-            name: llama2
-            port:
-              number: 8265
-      # Ray Serve
-      - path: /serve/(.*)
-        pathType: ImplementationSpecific
-        backend:
-          service:
-            name: llama2
-            port:
-              number: 8000
+    - http:
+        paths:
+          # Ray Dashboard
+          - path: /dashboard/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: llama2
+                port:
+                  number: 8265
+          # Ray Serve
+          - path: /serve/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: llama2
+                port:
+                  number: 8000
diff --git a/gen-ai/inference/llama3-8b-instruct-rayserve-inf2/ray-service-llama3.yaml b/gen-ai/inference/llama3-8b-instruct-rayserve-inf2/ray-service-llama3.yaml
index ae4447c9b..d3e4846f1 100644
--- a/gen-ai/inference/llama3-8b-instruct-rayserve-inf2/ray-service-llama3.yaml
+++ b/gen-ai/inference/llama3-8b-instruct-rayserve-inf2/ray-service-llama3.yaml
@@ -37,6 +37,7 @@ spec:
       dashboard-host: '0.0.0.0'
     template:
       spec:
+        schedulerName: my-scheduler
         containers:
           - name: head
             image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama3:latest # Image created using the Dockerfile attached in the folder
@@ -65,13 +66,13 @@ spec:
                 cpu: 4
                 memory: 20Gi
             env:
-            - name: HUGGING_FACE_HUB_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token
-                  key: hf-token
-            - name: LD_LIBRARY_PATH
-              value: "/home/ray/anaconda3/lib"
+              - name: HUGGING_FACE_HUB_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token
+                    key: hf-token
+              - name: LD_LIBRARY_PATH
+                value: "/home/ray/anaconda3/lib"
       nodeSelector: # This is using Karpenter Nodes with the provisioner label
         instanceType: mixed-x86
         provisionerType: Karpenter
@@ -87,6 +88,7 @@ spec:
       rayStartParams: {}
       template:
         spec:
+          schedulerName: my-scheduler
           containers:
             - name: worker
               image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama3:latest
@@ -105,13 +107,13 @@ spec:
                     memory: "700G"
                     aws.amazon.com/neuron: "12"
                 env:
-                - name: LD_LIBRARY_PATH
-                  value: /home/ray/anaconda3/lib
-                - name: HUGGING_FACE_HUB_TOKEN
-                  valueFrom:
-                    secretKeyRef:
-                      name: hf-token
-                      key: hf-token
+                  - name: LD_LIBRARY_PATH
+                    value: /home/ray/anaconda3/lib
+                  - name: HUGGING_FACE_HUB_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token
+                        key: hf-token
             nodeSelector:
               instanceType: inferentia-inf2
               provisionerType: Karpenter
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml
index c4401d032..95dc1f3fb 100644
--- a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml
@@ -66,6 +66,7 @@ spec:
       num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
     template:
       spec:
+        schedulerName: my-scheduler
         containers:
           - name: head
             image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -118,6 +119,7 @@ spec:
       rayStartParams: {}
       template:
         spec:
+          schedulerName: my-scheduler
           containers:
             - name: worker
               image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
index fa729bcea..c6a897167 100644
--- a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
@@ -64,6 +64,7 @@ spec:
       num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
     template:
       spec:
+        schedulerName: my-scheduler
         containers:
           - name: head
             image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -114,6 +115,7 @@ spec:
       rayStartParams: {}
      template:
         spec:
+          schedulerName: my-scheduler
           containers:
             - name: worker
               image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
diff --git a/gen-ai/inference/stable-diffusion-xl-base-rayserve-inf2/ray-service-stablediffusion.yaml b/gen-ai/inference/stable-diffusion-xl-base-rayserve-inf2/ray-service-stablediffusion.yaml
index e7802d669..aac23ea38 100644
--- a/gen-ai/inference/stable-diffusion-xl-base-rayserve-inf2/ray-service-stablediffusion.yaml
+++ b/gen-ai/inference/stable-diffusion-xl-base-rayserve-inf2/ray-service-stablediffusion.yaml
@@ -50,6 +50,7 @@ spec:
       dashboard-host: '0.0.0.0'
     template:
       spec:
+        schedulerName: my-scheduler
         containers:
           - name: head
             image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:latest
@@ -91,6 +92,7 @@ spec:
       rayStartParams: {}
      template:
         spec:
+          schedulerName: my-scheduler
           containers:
             - name: worker
               image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:latest
diff --git a/website/docs/gen-ai/inference/Mistral-7b-inf2.md b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
index e6b94fdd3..762965ada 100644
--- a/website/docs/gen-ai/inference/Mistral-7b-inf2.md
+++ b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -117,7 +117,7 @@ To deploy the Mistral-7B-Instruct-v0.2 model, it's essential to configure your H
 
 export HUGGING_FACE_HUB_TOKEN=$(echo -n "Your-Hugging-Face-Hub-Token-Value" | base64)
 
-cd ./../gen-ai/inference/mistral-7b-rayserve-inf2
+cd ../../gen-ai/inference/mistral-7b-rayserve-inf2
 
 envsubst < ray-service-mistral.yaml| kubectl apply -f -
 ```
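
Note that `schedulerName: my-scheduler` only selects a scheduler, it does not install one: pods that reference an unregistered scheduler stay in Pending. The entries above assume a second scheduler registered under that name (the Neuron scheduler extension deployed alongside these blueprints) is already running in the cluster. A minimal post-apply sanity check, sketched under that assumption and using the `llama2` namespace from the first manifest as an illustrative example:

```sh
# Show which scheduler each pod is assigned to; head and worker pods from the
# updated manifests should report my-scheduler instead of default-scheduler.
kubectl get pods -n llama2 \
  -o custom-columns='NAME:.metadata.name,SCHEDULER:.spec.schedulerName'

# Confirm binding actually happened; a pod stuck in Pending with no Scheduled
# event usually means no scheduler named my-scheduler is registered.
kubectl get events -n llama2 --field-selector reason=Scheduled
```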