Add vLLM example [DOC-169] #220

Open · wants to merge 4 commits into master
42 changes: 42 additions & 0 deletions docs/examples/llm/client.py
@@ -0,0 +1,42 @@
import subprocess

import openai


#
# Parse the server info from the job comment
#
def parse_meta(comment):
    data = dict()
    if comment != "(null)":
        items = comment.split("|")
        for kv in items:
            try:
                k, v = kv.split("=", maxsplit=1)
                data[k] = v
            except ValueError:
                # Skip malformed entries instead of silently swallowing all errors
                pass

    return data


def get_job_comment(name="inference_server.sh"):
    command = ["squeue", "-h", f"--name={name}", '--format="%k"']

    return subprocess.check_output(command, text=True).replace('"', "")


server = parse_meta(get_job_comment())
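# e.g. {'model': 'Llama-2-7b-chat-hf', 'host': 'cn-a001', 'port': '41523', 'shared': 'y'}
# (hypothetical values; the real ones come from the comment set by inference_server.sh)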

# Override the OpenAI API URL to point at our custom server
openai.api_key = "EMPTY"
openai.api_base = f"http://{server['host']}:{server['port']}/v1"


# Query the model through the OpenAI-compatible endpoint
completion = openai.Completion.create(
    model=server['model'],
    prompt="What is the square root of 25 ?"
)

print(completion)
90 changes: 90 additions & 0 deletions docs/examples/llm/inference_server.sh
@@ -0,0 +1,90 @@
#!/bin/bash

#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:15:00
#SBATCH --ntasks-per-node=1
#SBATCH --mem=32G

function usage() {
    echo "Usage: $0 [-h] [-m MODEL] [-p PATH] [-e ENV]"
    echo "  -h          Display this help message."
    echo "  -m MODEL    Specify the model name"
    echo "  -p PATH     Specify the model weights"
    echo "  -e ENV      Specify the conda environment to use."
    exit 1
}

MODEL=""
MODEL_PATH=""
ENV="./env"


while getopts ":hm:p:e:" opt; do
    case $opt in
        h)
            usage
            ;;
        m)
            MODEL="$OPTARG"
            ;;
        p)
            MODEL_PATH="$OPTARG"
            ;;
        e)
            ENV="$OPTARG"
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            usage
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            usage
            ;;
    esac
done

echo "model: $MODEL"
echo " path: $MODEL_PATH"
echo " env: $ENV"

export MILA_WEIGHTS="/network/weights/"
cd $SLURM_TMPDIR

#
# Work around conda complaining that the shell was not initialized properly
#
CONDA_EXEC="$(which conda)"
CONDA_BASE=$(dirname $CONDA_EXEC)
CONDA_ENVS="$CONDA_BASE/../envs"
source $CONDA_BASE/../etc/profile.d/conda.sh

#
# Create a new environment
#
if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
    conda create --prefix $ENV python=3.9 -y
fi
conda activate $ENV
pip install vllm

# Ask the OS for a free TCP port by binding to port 0 (the port is released
# before vLLM rebinds it, so a rare race with another job is possible)
PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])")
HOST="$(hostname)"
NAME="$MILA_WEIGHTS/$MODEL"

#
# Save metadata for retrieval by the client
#
scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"

#
# Launch Server
#
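# Note: --tensor-parallel-size below uses $SLURM_NTASKS_PER_NODE as a proxy for
# the GPU count, which assumes one GPU per task (true for the defaults above)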
python -m vllm.entrypoints.openai.api_server \
    --host $HOST \
    --port $PORT \
    --model "$MODEL_PATH" \
    --tensor-parallel-size $SLURM_NTASKS_PER_NODE \
    --served-model-name "$MODEL"
2 changes: 2 additions & 0 deletions docs/examples/llm/requirements.txt
@@ -0,0 +1,2 @@
vllm
openai
63 changes: 63 additions & 0 deletions docs/examples/llm/vllm.rst
@@ -0,0 +1,63 @@
LLM Inference
=============


Server
------

`vLLM <https://github.com/vllm-project/vllm>`_ comes with its own server entry point that mimics OpenAI's API.
It is very easy to set up and supports a wide range of models through Hugging Face.


.. code-block:: bash

    # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE
    sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base


By default, the script launches the server on an rtx8000 for 15 minutes.
You can override these defaults by passing arguments to ``sbatch``.


.. code-block:: bash

    sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base

.. note::

    We use the job comment to store the hostname, port, and model name,
    which enables the client to pick them up automatically.
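
    For example, the stored comment could look like the following
    (hypothetical values)::

        model=Llama-2-7b-chat-hf|host=cn-a001|port=41523|shared=y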


.. literalinclude:: inference_server.sh
    :language: bash


Client
------

Because vLLM replicates OpenAI's API, the client side is quite straightforward
and OpenAI's own client can be reused.

.. warning::

    The server takes a while to set up; you might have to wait a few minutes
    before it is ready for inference.

    You can follow the server's job log as it is written using
    ``tail -f slurm-<JOB-ID>.out``.

    Look for ``Uvicorn running on http://... (Press CTRL+C to quit)``
    to know when the server is ready to receive requests.
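
Alternatively, you can wait for the server programmatically. A minimal sketch,
assuming the ``requests`` package is installed and reusing the ``server``
dictionary built in ``client.py``:

.. code-block:: python

    import time

    import requests

    # Poll the OpenAI-compatible /v1/models endpoint until the server answers
    url = f"http://{server['host']}:{server['port']}/v1/models"
    while True:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                break  # the server is up and serving the API
        except requests.exceptions.RequestException:
            pass  # server not reachable yet
        time.sleep(10)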


.. note::

    We use ``squeue`` to look for the inference server job and configure the
    URL endpoint automatically.

    Make sure your job name is unique!


.. literalinclude:: client.py
    :language: python
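
For chat-tuned models such as ``Llama-2-7b-chat-hf``, the chat endpoint can be
used the same way. A minimal sketch with the same pre-1.0 ``openai`` client and
``server`` dictionary as above (assuming the model provides a chat template):

.. code-block:: python

    # Send a chat-style request instead of a raw completion
    completion = openai.ChatCompletion.create(
        model=server['model'],
        messages=[{"role": "user", "content": "What is the square root of 25?"}],
    )

    print(completion)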