# An example YAML for serving Google's Gemma model with an OpenAI-compatible API.
# Usage:
#   1. Launch on a single instance: `sky launch -c gemma ./serve.yaml`
#   2. Scale up to multiple instances with a single endpoint:
#      `sky serve up -n gemma ./serve.yaml`
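#
# Once a replica is up, the endpoint can be queried with a standard OpenAI-style
# chat request (a sketch: $ENDPOINT is a placeholder for the address SkyPilot
# reports, and max_tokens here is arbitrary):
#   curl http://$ENDPOINT/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "google/gemma-7b-it",
#          "messages": [{"role": "user", "content": "Hello! What is your name?"}],
#          "max_tokens": 32}'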
service:
  readiness_probe:
    # Probe the OpenAI chat completions endpoint with a minimal request.
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
    # Allow ample time for the model weights to download and load on first boot.
    initial_delay_seconds: 1200
  replicas: 2
envs:
  MODEL_NAME: google/gemma-7b-it
  HF_TOKEN: # TODO: Fill with your own Hugging Face token, or use --env to pass.
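# For example, to pass the token at launch time without editing this file
# (the token value below is a placeholder):
#   sky serve up -n gemma ./serve.yaml --env HF_TOKEN=hf_xxxx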
resources:
  # Any one of the listed accelerators satisfies this requirement.
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
  ports: 8000
  disk_tier: best
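# To pin a specific GPU instead of letting SkyPilot pick from the set above,
# the accelerator can be overridden on the command line, e.g.:
#   sky launch -c gemma ./serve.yaml --gpus A100:1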
setup: |
  # Create the conda env on the first run; reuse it on subsequent runs.
  conda activate gemma
  if [ $? -ne 0 ]; then
    conda create -n gemma -y python=3.10
    conda activate gemma
  fi
  pip install vllm==0.3.2
  pip install transformers==4.38.1
  # Log in to Hugging Face so the gated Gemma weights can be downloaded.
  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
run: |
  conda activate gemma
  export PATH=$PATH:/sbin
  # --max-model-len is set to 1024 to avoid exhausting GPU memory on the
  # smaller-memory L4 and A10g GPUs.
  python -u -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --model $MODEL_NAME \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-model-len 1024 | tee ~/openai_api_server.log
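
# For the single-instance launch, the server log can be tailed over SSH
# (SkyPilot adds the cluster to ~/.ssh/config at launch time):
#   ssh gemma 'tail -f ~/openai_api_server.log'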