From f980db7f8a7a9f58facac05404a21de92ba445b9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Wed, 4 Oct 2023 13:18:23 +0000 Subject: [PATCH 1/4] Add vLLM example --- docs/examples/llm/client.py | 34 +++++++++++++++ docs/examples/llm/inference_server.sh | 59 +++++++++++++++++++++++++++ docs/examples/llm/requirements.txt | 2 + docs/examples/llm/vllm.rst | 13 ++++++ 4 files changed, 108 insertions(+) create mode 100644 docs/examples/llm/client.py create mode 100644 docs/examples/llm/inference_server.sh create mode 100644 docs/examples/llm/requirements.txt create mode 100644 docs/examples/llm/vllm.rst diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py new file mode 100644 index 00000000..63cabf5f --- /dev/null +++ b/docs/examples/llm/client.py @@ -0,0 +1,34 @@ +import openai + +def parse_meta(comment): + data = dict() + if comment != "(null)": + items = comment.split("|") + for kv in items: + try: + k, v = kv.split("=", maxsplit=1) + data[k] = v + except: + pass + + return data + + +def get_job_comment(name="inference_server.sh"): + command = ["squeue", "-h", f"--name={name}", '--format="%k"'] + + return subprocess.check_output(command, text=True).replace('"', "") + + +server = parse_meta(get_job_comment()) + +openai.api_key = "EMPTY" +openai.api_base = f"http://{server['host']}:{server['port']}/v1" + + +completion = openai.Completion.create( + model=server['model'], + prompt=args.prompt +) + +print(completion) \ No newline at end of file diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh new file mode 100644 index 00000000..985a4ee5 --- /dev/null +++ b/docs/examples/llm/inference_server.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# +# Assume you have conda installed +# +# Usage: +# +# sbatch --ntasks-per-node=1 --mem=32G inference_server_SHARED.sh meta/Llama-2-7b-chat-hf +# sbatch --ntasks-per-node=2 --mem=64G inference_server_SHARED.sh meta/Llama-2-13b-chat-hf +# sbatch --ntasks-per-node=8 --mem=192G inference_server_SHARED.sh meta/Llama-2-70b-chat-hf +# + +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --time=00:15:00 +#SBATCH --ntasks-per-node=1 +#SBATCH --mem=32G + +MODEL="$1" +PATH="$2" + +export MILA_WEIGHTS="/network/weights/" + +cd $SLURM_TMPDIR + +# +# Fix problem with conda saying it is not "init properly" +# +CONDA_EXEC="$(which conda)" +CONDA_BASE=$(dirname $CONDA_EXEC) +source $CONDA_BASE/../etc/profile.d/conda.sh + +# +# Create a new environment +# +conda create --prefix ./env python=3.9 -y +conda activate ./env +pip install vllm + +# +# Save metadata for retrival +# + +PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])") +HOST="$(hostname)" +NAME="$WEIGHTS/$MODEL" + +echo " -> $HOST:$PORT" +scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y" + +# +# Launch Server +# +python -m vllm.entrypoints.openai.api_server \ + --host $HOST \ + --port $PORT \ + --model "$MODEL" \ + --tensor-parallel-size $SLURM_NTASKS_PER_NODE \ + --served-model-name "$MODEL" diff --git a/docs/examples/llm/requirements.txt b/docs/examples/llm/requirements.txt new file mode 100644 index 00000000..fc04a682 --- /dev/null +++ b/docs/examples/llm/requirements.txt @@ -0,0 +1,2 @@ +vllm +openai diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst new file mode 100644 index 00000000..0715ce2c --- /dev/null +++ b/docs/examples/llm/vllm.rst @@ -0,0 +1,13 @@ +LLM Inference +============= + + + +Dependencies +------------ + +.. 
code-block:: + + sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf + + From 4d3c06ad7abff7f0611f8d019014450e54cccd14 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Wed, 4 Oct 2023 13:41:26 +0000 Subject: [PATCH 2/4] Add walk through --- docs/examples/llm/client.py | 6 +++ docs/examples/llm/inference_server.sh | 55 ++++++++++++++++++++++----- docs/examples/llm/vllm.rst | 52 +++++++++++++++++++++++-- 3 files changed, 101 insertions(+), 12 deletions(-) diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py index 63cabf5f..86e3ce7e 100644 --- a/docs/examples/llm/client.py +++ b/docs/examples/llm/client.py @@ -1,5 +1,9 @@ import openai + +# +# Parse the server info from the job comment +# def parse_meta(comment): data = dict() if comment != "(null)": @@ -22,10 +26,12 @@ def get_job_comment(name="inference_server.sh"): server = parse_meta(get_job_comment()) +# Override OpenAPI API URL with out custom server openai.api_key = "EMPTY" openai.api_base = f"http://{server['host']}:{server['port']}/v1" +# profit completion = openai.Completion.create( model=server['model'], prompt=args.prompt diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh index 985a4ee5..acf2a857 100644 --- a/docs/examples/llm/inference_server.sh +++ b/docs/examples/llm/inference_server.sh @@ -16,8 +16,45 @@ #SBATCH --ntasks-per-node=1 #SBATCH --mem=32G -MODEL="$1" -PATH="$2" +usage() { + echo "Usage: $0 [-m] [-p] + echo " -h Display this help message." + echo " -m MODEL Specify a file to process." + echo " -p PATH Specify a directory to work in." + echo " ARGUMENT Any additional argument you want to process." + exit 1 +} + +MODEL="" +PATH="" +ENV="./env" + + +while getopts ":hf:d:" opt; do + case $opt in + h) + usage + ;; + m) + MODEL="$OPTARG" + ;; + p) + PATH="$OPTARG" + ;; + e) + ENV="$OPTARG" + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + usage + ;; + :) + echo "Option -$OPTARG requires an argument." >&2 + usage + ;; + esac +done + export MILA_WEIGHTS="/network/weights/" @@ -33,19 +70,19 @@ source $CONDA_BASE/../etc/profile.d/conda.sh # # Create a new environment # -conda create --prefix ./env python=3.9 -y -conda activate ./env +if [ ! -d "$ENV" ]; then + conda create --prefix $ENV python=3.9 -y +fi +conda activate $ENV pip install vllm -# -# Save metadata for retrival -# - PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])") HOST="$(hostname)" NAME="$WEIGHTS/$MODEL" -echo " -> $HOST:$PORT" +# +# Save metadata for retrival +# scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y" # diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst index 0715ce2c..e55c1291 100644 --- a/docs/examples/llm/vllm.rst +++ b/docs/examples/llm/vllm.rst @@ -2,12 +2,58 @@ LLM Inference ============= +Server +------ + +`vLLM `_ comes with its own server entry point that mimicks OpenAI's API. +It is very easy to setup and supports a wide range of models through Huggingfaces. + + +.. code-block:: + + # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE + sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base + + +By default the script will launch the server on an rtx8000 for 15 minutes. +You can override the defaults by specifying arguments to sbatch. -Dependencies ------------- .. 
code-block:: - sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf + sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base + +.. note:: + + We are using job comment to store hostname, port and model names, + which enable the client to automatically pick them up on its side. + + +.. literalinclude:: inference_server.sh + :language: bash + + +Client +------ + +Becasue vLLM replicates OpenAI's API, the client side is quite straight forward. +Own OpenAI's client can be reused. + +.. warning:: + + The server takes a while to setup you might to have to wait a few minutes + before the server is ready for inference. + + You can check the job log of the server. + Look for + + +.. note:: + + We use squeue to look for the inference server job to configure the + url endpoint automatically. + Make sure your job name is unique! +.. literalinclude:: client.py + :language: python From 7d16b5be1470cf034a5ad780082033891ff3a749 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Wed, 4 Oct 2023 11:00:19 -0400 Subject: [PATCH 3/4] Tweaks --- docs/examples/llm/client.py | 8 +++++--- docs/examples/llm/inference_server.sh | 22 +++++++++++++--------- docs/examples/llm/vllm.rst | 20 ++++++++++++-------- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py index 86e3ce7e..756761ae 100644 --- a/docs/examples/llm/client.py +++ b/docs/examples/llm/client.py @@ -1,3 +1,5 @@ +import subprocess + import openai @@ -33,8 +35,8 @@ def get_job_comment(name="inference_server.sh"): # profit completion = openai.Completion.create( - model=server['model'], - prompt=args.prompt + model=server['model'], + prompt="What is the square root of 25 ?" ) -print(completion) \ No newline at end of file +print(completion) diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh index acf2a857..fdc1f69c 100644 --- a/docs/examples/llm/inference_server.sh +++ b/docs/examples/llm/inference_server.sh @@ -16,21 +16,22 @@ #SBATCH --ntasks-per-node=1 #SBATCH --mem=32G -usage() { - echo "Usage: $0 [-m] [-p] +function usage() { + echo "Usage: $0 [-m] [-p]" echo " -h Display this help message." echo " -m MODEL Specify a file to process." echo " -p PATH Specify a directory to work in." + echo " -e ENV Specify the conda environementt to use." echo " ARGUMENT Any additional argument you want to process." exit 1 } MODEL="" -PATH="" +MODEL_PATH="" ENV="./env" -while getopts ":hf:d:" opt; do +while getopts ":hm:p:e:" opt; do case $opt in h) usage @@ -39,7 +40,7 @@ while getopts ":hf:d:" opt; do MODEL="$OPTARG" ;; p) - PATH="$OPTARG" + MODEL_PATH="$OPTARG" ;; e) ENV="$OPTARG" @@ -55,9 +56,11 @@ while getopts ":hf:d:" opt; do esac done +echo "model: $MODEL" +echo " path: $MODEL_PATH" +echo " env: $ENV" export MILA_WEIGHTS="/network/weights/" - cd $SLURM_TMPDIR # @@ -65,12 +68,13 @@ cd $SLURM_TMPDIR # CONDA_EXEC="$(which conda)" CONDA_BASE=$(dirname $CONDA_EXEC) +CONDA_ENVS="$CONDA_BASE/../envs" source $CONDA_BASE/../etc/profile.d/conda.sh # # Create a new environment # -if [ ! -d "$ENV" ]; then +if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! 
-d "$CONDA_ENVS/$ENV" ]; then conda create --prefix $ENV python=3.9 -y fi conda activate $ENV @@ -85,12 +89,12 @@ NAME="$WEIGHTS/$MODEL" # scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y" -# +# # Launch Server # python -m vllm.entrypoints.openai.api_server \ --host $HOST \ --port $PORT \ - --model "$MODEL" \ + --model "$MODEL_PATH" \ --tensor-parallel-size $SLURM_NTASKS_PER_NODE \ --served-model-name "$MODEL" diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst index e55c1291..b6501969 100644 --- a/docs/examples/llm/vllm.rst +++ b/docs/examples/llm/vllm.rst @@ -9,7 +9,7 @@ Server It is very easy to setup and supports a wide range of models through Huggingfaces. -.. code-block:: +.. code-block:: # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base @@ -19,7 +19,7 @@ By default the script will launch the server on an rtx8000 for 15 minutes. You can override the defaults by specifying arguments to sbatch. -.. code-block:: +.. code-block:: sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base @@ -36,24 +36,28 @@ You can override the defaults by specifying arguments to sbatch. Client ------ -Becasue vLLM replicates OpenAI's API, the client side is quite straight forward. -Own OpenAI's client can be reused. +Because vLLM replicates OpenAI's API, the client side is quite straight forward and +own OpenAI's client can be reused. .. warning:: - + The server takes a while to setup you might to have to wait a few minutes before the server is ready for inference. - You can check the job log of the server. - Look for + You can check the job log of the server using ``tail -f slurm-.out`` to + see the log as it is written. + + Look for ``Uvicorn running on http://... (Press CTRL+C to quit)`` + to know when the server is ready to receive requests. .. note:: - We use squeue to look for the inference server job to configure the + We use ``squeue`` to look for the inference server job to configure the url endpoint automatically. Make sure your job name is unique! + .. literalinclude:: client.py :language: python From c67719325a06f0c5c1dd79a514057bf83e586a81 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Wed, 4 Oct 2023 15:11:36 +0000 Subject: [PATCH 4/4] Remove out-dated comment --- docs/examples/llm/inference_server.sh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh index fdc1f69c..350e69a1 100644 --- a/docs/examples/llm/inference_server.sh +++ b/docs/examples/llm/inference_server.sh @@ -1,15 +1,5 @@ #!/bin/bash -# -# Assume you have conda installed -# -# Usage: -# -# sbatch --ntasks-per-node=1 --mem=32G inference_server_SHARED.sh meta/Llama-2-7b-chat-hf -# sbatch --ntasks-per-node=2 --mem=64G inference_server_SHARED.sh meta/Llama-2-13b-chat-hf -# sbatch --ntasks-per-node=8 --mem=192G inference_server_SHARED.sh meta/Llama-2-70b-chat-hf -# - #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 #SBATCH --time=00:15:00 @@ -19,8 +9,8 @@ function usage() { echo "Usage: $0 [-m] [-p]" echo " -h Display this help message." - echo " -m MODEL Specify a file to process." - echo " -p PATH Specify a directory to work in." 
+ echo " -m MODEL Specify the model name" + echo " -p PATH Specify the model weights" echo " -e ENV Specify the conda environementt to use." echo " ARGUMENT Any additional argument you want to process." exit 1 @@ -75,7 +65,7 @@ source $CONDA_BASE/../etc/profile.d/conda.sh # Create a new environment # if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then - conda create --prefix $ENV python=3.9 -y + conda create --prefix $ENV python=3.9 -y fi conda activate $ENV pip install vllm