From f980db7f8a7a9f58facac05404a21de92ba445b9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Wed, 4 Oct 2023 13:18:23 +0000 Subject: [PATCH 1/4] Add vLLM example --- docs/examples/llm/client.py | 34 +++++++++++++++ docs/examples/llm/inference_server.sh | 59 +++++++++++++++++++++++++++ docs/examples/llm/requirements.txt | 2 + docs/examples/llm/vllm.rst | 13 ++++++ 4 files changed, 108 insertions(+) create mode 100644 docs/examples/llm/client.py create mode 100644 docs/examples/llm/inference_server.sh create mode 100644 docs/examples/llm/requirements.txt create mode 100644 docs/examples/llm/vllm.rst diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py new file mode 100644 index 00000000..63cabf5f --- /dev/null +++ b/docs/examples/llm/client.py @@ -0,0 +1,34 @@ +import openai + +def parse_meta(comment): + data = dict() + if comment != "(null)": + items = comment.split("|") + for kv in items: + try: + k, v = kv.split("=", maxsplit=1) + data[k] = v + except: + pass + + return data + + +def get_job_comment(name="inference_server.sh"): + command = ["squeue", "-h", f"--name={name}", '--format="%k"'] + + return subprocess.check_output(command, text=True).replace('"', "") + + +server = parse_meta(get_job_comment()) + +openai.api_key = "EMPTY" +openai.api_base = f"http://{server['host']}:{server['port']}/v1" + + +completion = openai.Completion.create( + model=server['model'], + prompt=args.prompt +) + +print(completion) \ No newline at end of file diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh new file mode 100644 index 00000000..985a4ee5 --- /dev/null +++ b/docs/examples/llm/inference_server.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# +# Assume you have conda installed +# +# Usage: +# +# sbatch --ntasks-per-node=1 --mem=32G inference_server_SHARED.sh meta/Llama-2-7b-chat-hf +# sbatch --ntasks-per-node=2 --mem=64G inference_server_SHARED.sh meta/Llama-2-13b-chat-hf +# sbatch --ntasks-per-node=8 --mem=192G inference_server_SHARED.sh meta/Llama-2-70b-chat-hf +# + +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --time=00:15:00 +#SBATCH --ntasks-per-node=1 +#SBATCH --mem=32G + +MODEL="$1" +PATH="$2" + +export MILA_WEIGHTS="/network/weights/" + +cd $SLURM_TMPDIR + +# +# Fix problem with conda saying it is not "init properly" +# +CONDA_EXEC="$(which conda)" +CONDA_BASE=$(dirname $CONDA_EXEC) +source $CONDA_BASE/../etc/profile.d/conda.sh + +# +# Create a new environment +# +conda create --prefix ./env python=3.9 -y +conda activate ./env +pip install vllm + +# +# Save metadata for retrival +# + +PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])") +HOST="$(hostname)" +NAME="$WEIGHTS/$MODEL" + +echo " -> $HOST:$PORT" +scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y" + +# +# Launch Server +# +python -m vllm.entrypoints.openai.api_server \ + --host $HOST \ + --port $PORT \ + --model "$MODEL" \ + --tensor-parallel-size $SLURM_NTASKS_PER_NODE \ + --served-model-name "$MODEL" diff --git a/docs/examples/llm/requirements.txt b/docs/examples/llm/requirements.txt new file mode 100644 index 00000000..fc04a682 --- /dev/null +++ b/docs/examples/llm/requirements.txt @@ -0,0 +1,2 @@ +vllm +openai diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst new file mode 100644 index 00000000..0715ce2c --- /dev/null +++ b/docs/examples/llm/vllm.rst @@ -0,0 +1,13 @@ +LLM Inference +============= + + + +Dependencies +------------ + +.. 
code-block:: + + sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf + + From 4d3c06ad7abff7f0611f8d019014450e54cccd14 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Wed, 4 Oct 2023 13:41:26 +0000 Subject: [PATCH 2/4] Add walk through --- docs/examples/llm/client.py | 6 +++ docs/examples/llm/inference_server.sh | 55 ++++++++++++++++++++++----- docs/examples/llm/vllm.rst | 52 +++++++++++++++++++++++-- 3 files changed, 101 insertions(+), 12 deletions(-) diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py index 63cabf5f..86e3ce7e 100644 --- a/docs/examples/llm/client.py +++ b/docs/examples/llm/client.py @@ -1,5 +1,9 @@ import openai + +# +# Parse the server info from the job comment +# def parse_meta(comment): data = dict() if comment != "(null)": @@ -22,10 +26,12 @@ def get_job_comment(name="inference_server.sh"): server = parse_meta(get_job_comment()) +# Override OpenAPI API URL with out custom server openai.api_key = "EMPTY" openai.api_base = f"http://{server['host']}:{server['port']}/v1" +# profit completion = openai.Completion.create( model=server['model'], prompt=args.prompt diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh index 985a4ee5..acf2a857 100644 --- a/docs/examples/llm/inference_server.sh +++ b/docs/examples/llm/inference_server.sh @@ -16,8 +16,45 @@ #SBATCH --ntasks-per-node=1 #SBATCH --mem=32G -MODEL="$1" -PATH="$2" +usage() { + echo "Usage: $0 [-m] [-p] + echo " -h Display this help message." + echo " -m MODEL Specify a file to process." + echo " -p PATH Specify a directory to work in." + echo " ARGUMENT Any additional argument you want to process." + exit 1 +} + +MODEL="" +PATH="" +ENV="./env" + + +while getopts ":hf:d:" opt; do + case $opt in + h) + usage + ;; + m) + MODEL="$OPTARG" + ;; + p) + PATH="$OPTARG" + ;; + e) + ENV="$OPTARG" + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + usage + ;; + :) + echo "Option -$OPTARG requires an argument." >&2 + usage + ;; + esac +done + export MILA_WEIGHTS="/network/weights/" @@ -33,19 +70,19 @@ source $CONDA_BASE/../etc/profile.d/conda.sh # # Create a new environment # -conda create --prefix ./env python=3.9 -y -conda activate ./env +if [ ! -d "$ENV" ]; then + conda create --prefix $ENV python=3.9 -y +fi +conda activate $ENV pip install vllm -# -# Save metadata for retrival -# - PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])") HOST="$(hostname)" NAME="$WEIGHTS/$MODEL" -echo " -> $HOST:$PORT" +# +# Save metadata for retrival +# scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y" # diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst index 0715ce2c..e55c1291 100644 --- a/docs/examples/llm/vllm.rst +++ b/docs/examples/llm/vllm.rst @@ -2,12 +2,58 @@ LLM Inference ============= +Server +------ + +`vLLM `_ comes with its own server entry point that mimicks OpenAI's API. +It is very easy to setup and supports a wide range of models through Huggingfaces. + + +.. code-block:: + + # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE + sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base + + +By default the script will launch the server on an rtx8000 for 15 minutes. +You can override the defaults by specifying arguments to sbatch. -Dependencies ------------- .. 
code-block:: - sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf + sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base + +.. note:: + + We are using job comment to store hostname, port and model names, + which enable the client to automatically pick them up on its side. + + +.. literalinclude:: inference_server.sh + :language: bash + + +Client +------ + +Becasue vLLM replicates OpenAI's API, the client side is quite straight forward. +Own OpenAI's client can be reused. + +.. warning:: + + The server takes a while to setup you might to have to wait a few minutes + before the server is ready for inference. + + You can check the job log of the server. + Look for + + +.. note:: + + We use squeue to look for the inference server job to configure the + url endpoint automatically. + Make sure your job name is unique! +.. literalinclude:: client.py + :language: python From 7d16b5be1470cf034a5ad780082033891ff3a749 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Wed, 4 Oct 2023 11:00:19 -0400 Subject: [PATCH 3/4] Tweaks --- docs/examples/llm/client.py | 8 +++++--- docs/examples/llm/inference_server.sh | 22 +++++++++++++--------- docs/examples/llm/vllm.rst | 20 ++++++++++++-------- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py index 86e3ce7e..756761ae 100644 --- a/docs/examples/llm/client.py +++ b/docs/examples/llm/client.py @@ -1,3 +1,5 @@ +import subprocess + import openai @@ -33,8 +35,8 @@ def get_job_comment(name="inference_server.sh"): # profit completion = openai.Completion.create( - model=server['model'], - prompt=args.prompt + model=server['model'], + prompt="What is the square root of 25 ?" ) -print(completion) \ No newline at end of file +print(completion) diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh index acf2a857..fdc1f69c 100644 --- a/docs/examples/llm/inference_server.sh +++ b/docs/examples/llm/inference_server.sh @@ -16,21 +16,22 @@ #SBATCH --ntasks-per-node=1 #SBATCH --mem=32G -usage() { - echo "Usage: $0 [-m] [-p] +function usage() { + echo "Usage: $0 [-m] [-p]" echo " -h Display this help message." echo " -m MODEL Specify a file to process." echo " -p PATH Specify a directory to work in." + echo " -e ENV Specify the conda environementt to use." echo " ARGUMENT Any additional argument you want to process." exit 1 } MODEL="" -PATH="" +MODEL_PATH="" ENV="./env" -while getopts ":hf:d:" opt; do +while getopts ":hm:p:e:" opt; do case $opt in h) usage @@ -39,7 +40,7 @@ while getopts ":hf:d:" opt; do MODEL="$OPTARG" ;; p) - PATH="$OPTARG" + MODEL_PATH="$OPTARG" ;; e) ENV="$OPTARG" @@ -55,9 +56,11 @@ while getopts ":hf:d:" opt; do esac done +echo "model: $MODEL" +echo " path: $MODEL_PATH" +echo " env: $ENV" export MILA_WEIGHTS="/network/weights/" - cd $SLURM_TMPDIR # @@ -65,12 +68,13 @@ cd $SLURM_TMPDIR # CONDA_EXEC="$(which conda)" CONDA_BASE=$(dirname $CONDA_EXEC) +CONDA_ENVS="$CONDA_BASE/../envs" source $CONDA_BASE/../etc/profile.d/conda.sh # # Create a new environment # -if [ ! -d "$ENV" ]; then +if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! 
-d "$CONDA_ENVS/$ENV" ]; then conda create --prefix $ENV python=3.9 -y fi conda activate $ENV @@ -85,12 +89,12 @@ NAME="$WEIGHTS/$MODEL" # scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y" -# +# # Launch Server # python -m vllm.entrypoints.openai.api_server \ --host $HOST \ --port $PORT \ - --model "$MODEL" \ + --model "$MODEL_PATH" \ --tensor-parallel-size $SLURM_NTASKS_PER_NODE \ --served-model-name "$MODEL" diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst index e55c1291..b6501969 100644 --- a/docs/examples/llm/vllm.rst +++ b/docs/examples/llm/vllm.rst @@ -9,7 +9,7 @@ Server It is very easy to setup and supports a wide range of models through Huggingfaces. -.. code-block:: +.. code-block:: # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base @@ -19,7 +19,7 @@ By default the script will launch the server on an rtx8000 for 15 minutes. You can override the defaults by specifying arguments to sbatch. -.. code-block:: +.. code-block:: sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base @@ -36,24 +36,28 @@ You can override the defaults by specifying arguments to sbatch. Client ------ -Becasue vLLM replicates OpenAI's API, the client side is quite straight forward. -Own OpenAI's client can be reused. +Because vLLM replicates OpenAI's API, the client side is quite straight forward and +own OpenAI's client can be reused. .. warning:: - + The server takes a while to setup you might to have to wait a few minutes before the server is ready for inference. - You can check the job log of the server. - Look for + You can check the job log of the server using ``tail -f slurm-.out`` to + see the log as it is written. + + Look for ``Uvicorn running on http://... (Press CTRL+C to quit)`` + to know when the server is ready to receive requests. .. note:: - We use squeue to look for the inference server job to configure the + We use ``squeue`` to look for the inference server job to configure the url endpoint automatically. Make sure your job name is unique! + .. literalinclude:: client.py :language: python From c67719325a06f0c5c1dd79a514057bf83e586a81 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Wed, 4 Oct 2023 15:11:36 +0000 Subject: [PATCH 4/4] Remove out-dated comment --- docs/examples/llm/inference_server.sh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh index fdc1f69c..350e69a1 100644 --- a/docs/examples/llm/inference_server.sh +++ b/docs/examples/llm/inference_server.sh @@ -1,15 +1,5 @@ #!/bin/bash -# -# Assume you have conda installed -# -# Usage: -# -# sbatch --ntasks-per-node=1 --mem=32G inference_server_SHARED.sh meta/Llama-2-7b-chat-hf -# sbatch --ntasks-per-node=2 --mem=64G inference_server_SHARED.sh meta/Llama-2-13b-chat-hf -# sbatch --ntasks-per-node=8 --mem=192G inference_server_SHARED.sh meta/Llama-2-70b-chat-hf -# - #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 #SBATCH --time=00:15:00 @@ -19,8 +9,8 @@ function usage() { echo "Usage: $0 [-m] [-p]" echo " -h Display this help message." - echo " -m MODEL Specify a file to process." - echo " -p PATH Specify a directory to work in." 
+ echo " -m MODEL Specify the model name" + echo " -p PATH Specify the model weights" echo " -e ENV Specify the conda environementt to use." echo " ARGUMENT Any additional argument you want to process." exit 1 @@ -75,7 +65,7 @@ source $CONDA_BASE/../etc/profile.d/conda.sh # Create a new environment # if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then - conda create --prefix $ENV python=3.9 -y + conda create --prefix $ENV python=3.9 -y fi conda activate $ENV pip install vllm