Commit f980db7

Add vLLM example

Delaunay committed Oct 4, 2023
1 parent 543708c commit f980db7

Showing 4 changed files with 108 additions and 0 deletions.
34 changes: 34 additions & 0 deletions docs/examples/llm/client.py
@@ -0,0 +1,34 @@
import argparse
import subprocess

import openai


def parse_meta(comment):
    # The job comment is either "(null)" or a "key=value|key=value" string.
    data = dict()
    if comment != "(null)":
        items = comment.split("|")
        for kv in items:
            try:
                k, v = kv.split("=", maxsplit=1)
                data[k] = v
            except ValueError:
                pass

    return data


def get_job_comment(name="inference_server.sh"):
    # Read the comment field of the running inference server job.
    command = ["squeue", "-h", f"--name={name}", '--format="%k"']

    return subprocess.check_output(command, text=True).replace('"', "")


parser = argparse.ArgumentParser()
parser.add_argument("--prompt", required=True)
args = parser.parse_args()

server = parse_meta(get_job_comment())

openai.api_key = "EMPTY"
openai.api_base = f"http://{server['host']}:{server['port']}/v1"


completion = openai.Completion.create(
    model=server['model'],
    prompt=args.prompt,
)

print(completion)
59 changes: 59 additions & 0 deletions docs/examples/llm/inference_server.sh
@@ -0,0 +1,59 @@
#!/bin/bash

#
# Assume you have conda installed
#
# Usage:
#
#    sbatch --ntasks-per-node=1 --mem=32G  inference_server.sh meta/Llama-2-7b-chat-hf
#    sbatch --ntasks-per-node=2 --mem=64G  inference_server.sh meta/Llama-2-13b-chat-hf
#    sbatch --ntasks-per-node=8 --mem=192G inference_server.sh meta/Llama-2-70b-chat-hf
#

#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:15:00
#SBATCH --ntasks-per-node=1
#SBATCH --mem=32G

MODEL="$1"
WEIGHTS="$2"

export MILA_WEIGHTS="/network/weights/"

cd $SLURM_TMPDIR

#
# Work around conda complaining that it has not been properly initialized
#
CONDA_EXEC="$(which conda)"
CONDA_BASE=$(dirname $CONDA_EXEC)
source $CONDA_BASE/../etc/profile.d/conda.sh

#
# Create a new environment
#
conda create --prefix ./env python=3.9 -y
conda activate ./env
pip install vllm

#
# Save metadata for retrieval by the client
#

# Ask the OS for a free port by binding to port 0
PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])")
HOST="$(hostname)"
NAME="$WEIGHTS/$MODEL"

echo " -> $HOST:$PORT"
scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"

#
# Launch Server
#
python -m vllm.entrypoints.openai.api_server \
    --host "$HOST" \
    --port "$PORT" \
    --model "$MODEL" \
    --tensor-parallel-size "$SLURM_NTASKS_PER_NODE" \
    --served-model-name "$MODEL"
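
A rough sketch of checking the server from a login node, assuming compute nodes are reachable directly and the OpenAI-compatible /v1/models route is served: read the host and port back from the job comment, then query the endpoint.

META=$(squeue -h --name=inference_server.sh --format="%k" | tr -d '"')
HOST=$(echo "$META" | tr '|' '\n' | sed -n 's/^host=//p')
PORT=$(echo "$META" | tr '|' '\n' | sed -n 's/^port=//p')
curl "http://$HOST:$PORT/v1/models"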
2 changes: 2 additions & 0 deletions docs/examples/llm/requirements.txt
@@ -0,0 +1,2 @@
vllm
openai
13 changes: 13 additions & 0 deletions docs/examples/llm/vllm.rst
@@ -0,0 +1,13 @@
LLM Inference
=============



Dependencies
------------

``vllm`` and ``openai``, listed in ``requirements.txt``.


Usage
-----

.. code-block:: bash

   sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf
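
Once the job is running, the server can be queried through the client; a rough sketch, assuming the ``--prompt`` argument defined in ``client.py``:

.. code-block:: bash

   python client.py --prompt "What is the capital of France?"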
