rework vllm_inference example to showcase openai-compatible mode
charlesfrye committed Jul 31, 2024
1 parent 49af273 commit aa3c95c
Showing 8 changed files with 456 additions and 443 deletions.
55 changes: 55 additions & 0 deletions 06_gpu_and_ml/llm-serving/download_llama.py
@@ -0,0 +1,55 @@
import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install(
[
"huggingface_hub", # download models from the Hugging Face Hub
"hf-transfer", # download models faster with Rust
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


MINUTES = 60
HOURS = 60 * MINUTES


app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision):
from huggingface_hub import snapshot_download

volume.reload()

snapshot_download(
model_name,
local_dir=MODELS_DIR,
ignore_patterns=[
"*.pt",
"*.bin",
"*.pth",
"original/*",
], # Ensure safetensors
revision=model_revision,
)

volume.commit()


@app.local_entrypoint()
def main(
model_name: str = DEFAULT_NAME,
model_revision: str = DEFAULT_REVISION,
):
download_model.remote(model_name, model_revision)
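Usage note (an assumption, not part of the diff): with this file saved as download_llama.py, the weights can be pulled into the shared "llamas" Volume by invoking the local entrypoint, for example

modal run download_llama.py
modal run download_llama.py --model-name meta-llama/Meta-Llama-3.1-70B-Instruct --model-revision &lt;revision&gt;

The "huggingface" Secret is expected to hold a Hugging Face token with access to the gated Llama 3.1 repository.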
@@ -106,22 +106,20 @@ def main():
 
     WORKSPACE = modal.config._profile
 
-    client.base_url = (
-        f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"
-    )
+    client.base_url = f"https://{WORKSPACE}--example-vllm-openai-compatible-serve.modal.run/v1"
 
     if args.model:
         model_id = args.model
         print(
             Colors.BOLD,
-            f"🧠: Using model {model_id}. This may trigger a boot on first call!",
+            f"🧠: Using model {model_id}. This may trigger a model load on first call!",
             Colors.END,
             sep="",
         )
     else:
         print(
             Colors.BOLD,
-            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a boot!",
+            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
             Colors.END,
             sep="",
         )
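For context, a minimal sketch of calling the renamed OpenAI-compatible endpoint directly with the openai SDK (the workspace placeholder and the bearer token below are assumptions, the token mirroring the one hard-coded in locustfile.py further down):

from openai import OpenAI

workspace = "your-workspace"  # hypothetical placeholder; the real client reads modal.config._profile

client = OpenAI(
    api_key="super-secret-token",  # assumed to match the token the vLLM server was deployed with
    base_url=f"https://{workspace}--example-vllm-openai-compatible-serve.modal.run/v1",
)

completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Give me two fun date ideas."}],
)
print(completion.choices[0].message.content)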
55 changes: 55 additions & 0 deletions 06_gpu_and_ml/llm-serving/openai_compatible/download.py
@@ -0,0 +1,55 @@
import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install(
[
"huggingface_hub", # download models from the Hugging Face Hub
"hf-transfer", # download models faster with Rust
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


MINUTES = 60
HOURS = 60 * MINUTES


app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision):
from huggingface_hub import snapshot_download

volume.reload()

snapshot_download(
model_name,
local_dir=MODELS_DIR,
ignore_patterns=[
"*.pt",
"*.bin",
"*.pth",
"original/*",
], # Ensure safetensors
revision=model_revision,
)

volume.commit()


@app.local_entrypoint()
def main(
model_name: str = DEFAULT_NAME,
model_revision: str = DEFAULT_REVISION,
):
download_model.remote(model_name, model_revision)
91 changes: 91 additions & 0 deletions 06_gpu_and_ml/llm-serving/openai_compatible/load_test.py
@@ -0,0 +1,91 @@
import os
from datetime import datetime
from pathlib import Path

import modal

if modal.is_local():
workspace = modal.config._profile
else:
workspace = os.environ["MODAL_WORKSPACE"]


image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("locust~=2.29.1", "openai~=1.37.1")
.env({"MODAL_WORKSPACE": workspace})
.copy_local_file(
Path(__file__).parent / "locustfile.py",
remote_path="/root/locustfile.py",
)
)
volume = modal.Volume.from_name(
"loadtest-vllm-oai-results", create_if_missing=True
)
remote_path = Path("/root") / "loadtests"
OUT_DIRECTORY = (
remote_path / datetime.utcnow().replace(microsecond=0).isoformat()
)

app = modal.App("loadtest-vllm-oai", image=image, volumes={remote_path: volume})

workers = 8
host = f"https://{workspace}--example-vllm-openai-compatible-serve.modal.run"
csv_file = OUT_DIRECTORY / "stats.csv"
default_args = [
"-H",
host,
"--processes",
str(workers),
"--csv",
csv_file,
]

MINUTES = 60 # seconds


@app.function(allow_concurrent_inputs=1000, cpu=workers)
@modal.web_server(port=8089)
def serve():
run_locust.local(default_args)


@app.function(cpu=workers, timeout=60 * MINUTES)
def run_locust(args: list, wait=False):
import subprocess

process = subprocess.Popen(["locust"] + args)
if wait:
process.wait()
return process.returncode


@app.local_entrypoint()
def main(
r: float = 1.0,
u: int = 36,
t: str = "1m", # no more than the timeout of run_locust, one hour
):
args = default_args + [
"--spawn-rate",
str(r),
"--users",
str(u),
"--run-time",
t,
]

html_report_file = OUT_DIRECTORY / "report.html"
args += [
"--headless", # run without browser UI
"--autostart", # start test immediately
"--autoquit", # stop once finished...
"10", # ...but wait ten seconds
"--html", # output an HTML-formatted report
html_report_file, # to this location
]

    if exit_code := run_locust.remote(args, wait=True):
        raise SystemExit(exit_code)  # propagate a nonzero Locust exit code
    else:
        print("finished successfully")
37 changes: 37 additions & 0 deletions 06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py
@@ -0,0 +1,37 @@
import logging
import random

import locust

messages = [
{
"role": "system",
"content": "You are a salesman for Modal, the cloud-native serverless Python computing platform.",
},
{
"role": "user",
"content": "Give me two fun date ideas.",
},
]


class WebsiteUser(locust.HttpUser):
wait_time = locust.between(1, 5)
headers = {
"Authorization": "Bearer super-secret-token",
"Accept": "application/json",
}

@locust.task
def chat_completion(self):
payload = {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"messages": messages,
}

response = self.client.request(
"POST", "/v1/chat/completions", json=payload, headers=self.headers
)
response.raise_for_status()
if random.random() < 0.01:
logging.info(response.json()["choices"][0]["message"]["content"])
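This locustfile is copied into the load-test image above, but it is also a plain Locust file; assuming Locust is installed locally, it could presumably be pointed at the deployed server directly with something like

locust -f locustfile.py -H https://YOUR-WORKSPACE--example-vllm-openai-compatible-serve.modal.run

The hard-coded Bearer token and model name have to match whatever the vLLM server was deployed with.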