From f980db7f8a7a9f58facac05404a21de92ba445b9 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Wed, 4 Oct 2023 13:18:23 +0000
Subject: [PATCH] Add vLLM example

---
 docs/examples/llm/client.py           | 43 +++++++++++++++++++++++
 docs/examples/llm/inference_server.sh | 58 ++++++++++++++++++++++++++++
 docs/examples/llm/requirements.txt    |  2 +
 docs/examples/llm/vllm.rst            | 19 +++++++++
 4 files changed, 122 insertions(+)
 create mode 100644 docs/examples/llm/client.py
 create mode 100644 docs/examples/llm/inference_server.sh
 create mode 100644 docs/examples/llm/requirements.txt
 create mode 100644 docs/examples/llm/vllm.rst

diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py
new file mode 100644
index 00000000..63cabf5f
--- /dev/null
+++ b/docs/examples/llm/client.py
@@ -0,0 +1,43 @@
+import argparse
+import subprocess
+
+import openai
+
+
+def parse_meta(comment):
+    # The server job publishes its address as "key=value" pairs separated by "|"
+    data = dict()
+    if comment != "(null)":
+        items = comment.split("|")
+        for kv in items:
+            try:
+                k, v = kv.split("=", maxsplit=1)
+                data[k] = v
+            except ValueError:
+                pass
+
+    return data
+
+
+def get_job_comment(name="inference_server.sh"):
+    # Read the comment field of the running inference server job
+    command = ["squeue", "-h", f"--name={name}", '--format="%k"']
+
+    return subprocess.check_output(command, text=True).replace('"', "").strip()
+
+
+parser = argparse.ArgumentParser(description="Query the vLLM inference server")
+parser.add_argument("--prompt", required=True)
+args = parser.parse_args()
+
+server = parse_meta(get_job_comment())
+
+openai.api_key = "EMPTY"
+openai.api_base = f"http://{server['host']}:{server['port']}/v1"
+
+completion = openai.Completion.create(
+    model=server["model"],
+    prompt=args.prompt,
+)
+
+print(completion)
diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh
new file mode 100644
index 00000000..985a4ee5
--- /dev/null
+++ b/docs/examples/llm/inference_server.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+#
+# Assumes conda is installed
+#
+# Usage:
+#
+# sbatch --ntasks-per-node=1 --mem=32G  inference_server.sh Llama-2-7b-chat-hf  <path to Llama-2-7b-chat-hf weights>
+# sbatch --ntasks-per-node=2 --mem=64G  inference_server.sh Llama-2-13b-chat-hf <path to Llama-2-13b-chat-hf weights>
+# sbatch --ntasks-per-node=8 --mem=192G inference_server.sh Llama-2-70b-chat-hf <path to Llama-2-70b-chat-hf weights>
+#
+
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:15:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=32G
+
+MODEL="$1"
+MODEL_PATH="$2"
+
+export MILA_WEIGHTS="/network/weights/"
+
+cd "$SLURM_TMPDIR"
+
+#
+# Work around conda complaining that the shell was not initialized properly
+#
+CONDA_EXEC="$(which conda)"
+CONDA_BASE=$(dirname "$CONDA_EXEC")
+source "$CONDA_BASE/../etc/profile.d/conda.sh"
+
+#
+# Create a new environment
+#
+conda create --prefix ./env python=3.9 -y
+conda activate ./env
+pip install vllm
+
+#
+# Save metadata so clients can retrieve the server's address
+#
+
+PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])")
+HOST="$(hostname)"
+
+echo " -> $HOST:$PORT"
+scontrol update job "$SLURM_JOB_ID" comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"
+
+#
+# Launch Server
+#
+python -m vllm.entrypoints.openai.api_server \
+    --host "$HOST" \
+    --port "$PORT" \
+    --model "$MODEL_PATH" \
+    --tensor-parallel-size "$SLURM_NTASKS_PER_NODE" \
+    --served-model-name "$MODEL"
diff --git a/docs/examples/llm/requirements.txt b/docs/examples/llm/requirements.txt
new file mode 100644
index 00000000..fc04a682
--- /dev/null
+++ b/docs/examples/llm/requirements.txt
@@ -0,0 +1,2 @@
+vllm
+openai
diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst
new file mode 100644
index 00000000..0715ce2c
--- /dev/null
+++ b/docs/examples/llm/vllm.rst
@@ -0,0 +1,19 @@
+LLM Inference
+=============
+
+
+Dependencies
+------------
+
+The client uses the ``openai`` package; the server job installs ``vllm``
+(see ``requirements.txt``).
+
+
+Usage
+-----
+
+Launch the inference server as a batch job:
+
+.. code-block:: bash
+
+   sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf
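
For reference, an end-to-end run with the files above might look like the sketch
below. This is an illustration only: the ``--prompt`` flag comes from
``client.py`` in this patch, the weights path is the one used in ``vllm.rst``,
and the prompt text is arbitrary.

.. code-block:: bash

   # Submit the server job (the script requests 15 minutes of walltime by default)
   sbatch --ntasks-per-node=1 --mem=32G inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf

   # Wait until the job is RUNNING and its comment contains the host/port metadata
   squeue --name=inference_server.sh

   # Query the served model through the OpenAI-compatible API
   python client.py --prompt "What is the square root of 2?"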