From f980db7f8a7a9f58facac05404a21de92ba445b9 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Wed, 4 Oct 2023 13:18:23 +0000
Subject: [PATCH] Add vLLM example

---
 docs/examples/llm/client.py           | 43 +++++++++++++++++++++++
 docs/examples/llm/inference_server.sh | 58 ++++++++++++++++++++++++++++
 docs/examples/llm/requirements.txt    |  2 +
 docs/examples/llm/vllm.rst            | 19 +++++++++
 4 files changed, 122 insertions(+)
 create mode 100644 docs/examples/llm/client.py
 create mode 100644 docs/examples/llm/inference_server.sh
 create mode 100644 docs/examples/llm/requirements.txt
 create mode 100644 docs/examples/llm/vllm.rst

diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py
new file mode 100644
index 00000000..63cabf5f
--- /dev/null
+++ b/docs/examples/llm/client.py
@@ -0,0 +1,43 @@
+import argparse
+import subprocess
+
+import openai
+
+
+def parse_meta(comment):
+    # The server job publishes its address as "key=value" pairs separated by "|"
+    data = dict()
+    if comment != "(null)":
+        items = comment.split("|")
+        for kv in items:
+            try:
+                k, v = kv.split("=", maxsplit=1)
+                data[k] = v
+            except ValueError:
+                pass
+
+    return data
+
+
+def get_job_comment(name="inference_server.sh"):
+    # Read the comment field of the running inference server job
+    command = ["squeue", "-h", f"--name={name}", '--format="%k"']
+
+    return subprocess.check_output(command, text=True).replace('"', "").strip()
+
+
+parser = argparse.ArgumentParser(description="Query the vLLM inference server")
+parser.add_argument("--prompt", required=True)
+args = parser.parse_args()
+
+server = parse_meta(get_job_comment())
+
+openai.api_key = "EMPTY"
+openai.api_base = f"http://{server['host']}:{server['port']}/v1"
+
+completion = openai.Completion.create(
+    model=server["model"],
+    prompt=args.prompt,
+)
+
+print(completion)
diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh
new file mode 100644
index 00000000..985a4ee5
--- /dev/null
+++ b/docs/examples/llm/inference_server.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+#
+# Assumes conda is installed
+#
+# Usage:
+#
+# sbatch --ntasks-per-node=1 --mem=32G  inference_server.sh Llama-2-7b-chat-hf  <path to Llama-2-7b-chat-hf weights>
+# sbatch --ntasks-per-node=2 --mem=64G  inference_server.sh Llama-2-13b-chat-hf <path to Llama-2-13b-chat-hf weights>
+# sbatch --ntasks-per-node=8 --mem=192G inference_server.sh Llama-2-70b-chat-hf <path to Llama-2-70b-chat-hf weights>
+#
+
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:15:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=32G
+
+MODEL="$1"
+MODEL_PATH="$2"
+
+export MILA_WEIGHTS="/network/weights/"
+
+cd "$SLURM_TMPDIR"
+
+#
+# Work around conda complaining that the shell was not initialized properly
+#
+CONDA_EXEC="$(which conda)"
+CONDA_BASE=$(dirname "$CONDA_EXEC")
+source "$CONDA_BASE/../etc/profile.d/conda.sh"
+
+#
+# Create a new environment
+#
+conda create --prefix ./env python=3.9 -y
+conda activate ./env
+pip install vllm
+
+#
+# Save metadata so clients can retrieve the server's address
+#
+
+PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])")
+HOST="$(hostname)"
+
+echo " -> $HOST:$PORT"
+scontrol update job "$SLURM_JOB_ID" comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"
+
+#
+# Launch Server
+#
+python -m vllm.entrypoints.openai.api_server \
+    --host "$HOST" \
+    --port "$PORT" \
+    --model "$MODEL_PATH" \
+    --tensor-parallel-size "$SLURM_NTASKS_PER_NODE" \
+    --served-model-name "$MODEL"
diff --git a/docs/examples/llm/requirements.txt b/docs/examples/llm/requirements.txt
new file mode 100644
index 00000000..fc04a682
--- /dev/null
+++ b/docs/examples/llm/requirements.txt
@@ -0,0 +1,2 @@
+vllm
+openai
diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst
new file mode 100644
index 00000000..0715ce2c
--- /dev/null
+++ b/docs/examples/llm/vllm.rst
@@ -0,0 +1,19 @@
+LLM Inference
+=============
+
+
+Dependencies
+------------
+
+The client uses the ``openai`` package; the server job installs ``vllm``
+(see ``requirements.txt``).
+
+
+Usage
+-----
+
+Launch the inference server as a batch job:
+
+.. code-block:: bash
+
+   sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf
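
For reference, an end-to-end run with the files above might look like the sketch
below. This is an illustration only: the ``--prompt`` flag comes from
``client.py`` in this patch, the weights path is the one used in ``vllm.rst``,
and the prompt text is arbitrary.

.. code-block:: bash

   # Submit the server job (the script requests 15 minutes of walltime by default)
   sbatch --ntasks-per-node=1 --mem=32G inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf

   # Wait until the job is RUNNING and its comment contains the host/port metadata
   squeue --name=inference_server.sh

   # Query the served model through the OpenAI-compatible API
   python client.py --prompt "What is the square root of 2?"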