feat: Neuron scheduler update for trainium-inferentia blueprints (#624)
ratnopamc authored Aug 30, 2024
1 parent 6a92682 · commit ff14530
Showing 6 changed files with 112 additions and 102 deletions.
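In short, the change threads `schedulerName: my-scheduler` into the head and worker Pod templates of each trainium/inferentia blueprint, so that Pods requesting `aws.amazon.com/neuron` devices are bound by the Neuron scheduler extension rather than the default kube-scheduler. Below is a minimal sketch of the recurring pattern (not copied verbatim from any single file in this diff), assuming the Neuron scheduler extension is already installed in the cluster and registered under the name `my-scheduler`. Note that `schedulerName` is a PodSpec field, so it belongs under `template.spec`:

```yaml
# Sketch only. Assumes the AWS Neuron scheduler extension is deployed and
# registered in the cluster under the scheduler name "my-scheduler".
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: example-neuron-service   # hypothetical name, for illustration
spec:
  rayClusterConfig:
    workerGroupSpecs:
      - groupName: inf2
        template:
          spec:
            schedulerName: my-scheduler   # PodSpec field: hands binding over to the Neuron scheduler
            containers:
              - name: worker
                resources:
                  limits:
                    aws.amazon.com/neuron: "12"   # devices advertised by the Neuron device plugin
```

Pods that omit `schedulerName` continue to be placed by `default-scheduler`, which treats the Neuron resource as an opaque count; awareness of Neuron device topology is the gap the custom scheduler is meant to close.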
176 changes: 89 additions & 87 deletions gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
@@ -49,83 +49,85 @@ spec:
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest # Image created using the Dockerfile attached in the folder
              imagePullPolicy: Always # Ensure the image is always pulled when updated
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 6379
                  name: gcs
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: 4
                  memory: 20Gi
                requests:
                  cpu: 4
                  memory: 20Gi
              env:
                - name: LD_LIBRARY_PATH
                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
          nodeSelector: # This is using Karpenter Nodes with the provisioner label
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
            - name: ray-logs
              emptyDir: {}
    workerGroupSpecs:
      - groupName: inf2
        replicas: 1
        minReplicas: 1
        maxReplicas: 1
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest
                imagePullPolicy: Always # Ensure the image is always pulled when updated
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
                    cpu: "180"
                    memory: "700G"
                    aws.amazon.com/neuron: "12"
                  requests:
                    cpu: "180"
                    memory: "700G"
                    aws.amazon.com/neuron: "12"
                env:
                  - name: LD_LIBRARY_PATH
                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
            tolerations:
              - key: "aws.amazon.com/neuron"
                operator: "Exists"
                effect: "NoSchedule"
              - key: "hub.jupyter.org/dedicated"
                operator: "Equal"
                value: "user"
                effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
@@ -137,21 +139,21 @@ metadata:
spec:
  ingressClassName: nginx
  rules:
    - http:
        paths:
          # Ray Dashboard
          - path: /dashboard/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: llama2
                port:
                  number: 8265
          # Ray Serve
          - path: /serve/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: llama2
                port:
                  number: 8000
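Independently of Ray, the scheduler wiring can be sanity-checked with a throwaway Pod. Everything in the sketch below is illustrative (the Pod name and image are hypothetical); only the scheduler name, the resource name, and the toleration mirror the manifests above:

```yaml
# Hypothetical smoke test: if "my-scheduler" is running and registered, this Pod
# gets bound by it; if not, the Pod stays Pending with no scheduling events.
apiVersion: v1
kind: Pod
metadata:
  name: neuron-sched-smoke-test   # illustrative name
spec:
  schedulerName: my-scheduler
  tolerations:
    - key: "aws.amazon.com/neuron"   # same taint the worker groups tolerate
      operator: "Exists"
      effect: "NoSchedule"
  containers:
    - name: app
      image: public.ecr.aws/docker/library/busybox:latest   # any small image works
      command: ["sleep", "3600"]
      resources:
        limits:
          aws.amazon.com/neuron: "1"   # request a single Neuron device
```

If the Pod stays Pending with no events attributed to `my-scheduler`, the scheduler extension is likely not running or is registered under a different name.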
@@ -37,6 +37,7 @@ spec:
        dashboard-host: '0.0.0.0'
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama3:latest # Image created using the Dockerfile attached in the folder
@@ -65,13 +66,13 @@ spec:
                  cpu: 4
                  memory: 20Gi
              env:
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: hf-token
                - name: LD_LIBRARY_PATH
                  value: "/home/ray/anaconda3/lib"
          nodeSelector: # This is using Karpenter Nodes with the provisioner label
            instanceType: mixed-x86
            provisionerType: Karpenter
@@ -87,6 +88,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama3:latest
@@ -105,13 +107,13 @@ spec:
                    memory: "700G"
                    aws.amazon.com/neuron: "12"
                env:
                  - name: LD_LIBRARY_PATH
                    value: /home/ray/anaconda3/lib
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token
                        key: hf-token
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
@@ -66,6 +66,7 @@ spec:
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -118,6 +119,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -64,6 +64,7 @@ spec:
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -114,6 +115,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
@@ -50,6 +50,7 @@ spec:
        dashboard-host: '0.0.0.0'
      template:
        spec:
+         schedulerName: my-scheduler
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:latest
@@ -91,6 +92,7 @@ spec:
        rayStartParams: {}
        template:
          spec:
+           schedulerName: my-scheduler
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.9.0-py310-stablediffusion-neuron:latest
2 changes: 1 addition & 1 deletion website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -117,7 +117,7 @@ To deploy the Mistral-7B-Instruct-v0.2 model, it's essential to configure your H

export HUGGING_FACE_HUB_TOKEN=$(echo -n "Your-Hugging-Face-Hub-Token-Value" | base64)

-cd ./../gen-ai/inference/mistral-7b-rayserve-inf2
+cd ../../gen-ai/inference/mistral-7b-rayserve-inf2
envsubst < ray-service-mistral.yaml| kubectl apply -f -
```
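For context on that last hunk: the documented deploy step pipes the manifest through `envsubst`, which substitutes the exported `HUGGING_FACE_HUB_TOKEN` value into the Secret that the containers' `secretKeyRef` entries (name `hf-token`, key `hf-token`) read. A hedged sketch of what the manifest plausibly contains before substitution:

```yaml
# Sketch; assumes ray-service-mistral.yaml templates the token into a Secret
# this way (the Secret name and key match the secretKeyRef entries above).
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
data:
  hf-token: $HUGGING_FACE_HUB_TOKEN   # replaced by envsubst; already base64-encoded by the export
```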