# An example YAML for serving Google's Gemma model with an OpenAI-compatible API.
# Usage:
#   1. Launch on a single instance: `sky launch -c gemma ./serve.yaml`
#   2. Scale up to multiple instances with a single endpoint:
#      `sky serve up -n gemma ./serve.yaml`
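#
# Once a replica is up, the endpoint can be queried with a standard OpenAI-style
# chat request (a sketch: $ENDPOINT is a placeholder for the address SkyPilot
# reports, and max_tokens here is arbitrary):
#   curl http://$ENDPOINT/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "google/gemma-7b-it",
#          "messages": [{"role": "user", "content": "Hello! What is your name?"}],
#          "max_tokens": 32}'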
service:
  readiness_probe:
    # Probe the OpenAI chat completions endpoint with a minimal request.
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
    # Allow ample time for the model weights to download and load on first boot.
    initial_delay_seconds: 1200
  replicas: 2
envs:
  MODEL_NAME: google/gemma-7b-it
  HF_TOKEN: # TODO: Fill with your own Hugging Face token, or use --env to pass.
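# For example, to pass the token at launch time without editing this file
# (the token value below is a placeholder):
#   sky serve up -n gemma ./serve.yaml --env HF_TOKEN=hf_xxxx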
resources:
  # Any one of the listed accelerators satisfies this requirement.
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
  ports: 8000
  disk_tier: best
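# To pin a specific GPU instead of letting SkyPilot pick from the set above,
# the accelerator can be overridden on the command line, e.g.:
#   sky launch -c gemma ./serve.yaml --gpus A100:1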
setup: |
  # Create the conda env on the first run; reuse it on subsequent runs.
  conda activate gemma
  if [ $? -ne 0 ]; then
    conda create -n gemma -y python=3.10
    conda activate gemma
  fi
  pip install vllm==0.3.2
  pip install transformers==4.38.1
  # Log in to Hugging Face so the gated Gemma weights can be downloaded.
  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
run: |
  conda activate gemma
  export PATH=$PATH:/sbin
  # --max-model-len is set to 1024 to avoid exhausting GPU memory on the
  # smaller-memory L4 and A10g GPUs.
  python -u -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --model $MODEL_NAME \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-model-len 1024 | tee ~/openai_api_server.log
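
# For the single-instance launch, the server log can be tailed over SSH
# (SkyPilot adds the cluster to ~/.ssh/config at launch time):
#   ssh gemma 'tail -f ~/openai_api_server.log'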