rework vllm_inference example to showcase openai-compatible mode
charlesfrye committed Jul 31, 2024
1 parent 49af273 commit aa3c95c
Showing 8 changed files with 456 additions and 443 deletions.
55 changes: 55 additions & 0 deletions 06_gpu_and_ml/llm-serving/download_llama.py
@@ -0,0 +1,55 @@
import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install(
[
"huggingface_hub", # download models from the Hugging Face Hub
"hf-transfer", # download models faster with Rust
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


MINUTES = 60
HOURS = 60 * MINUTES


app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision):
from huggingface_hub import snapshot_download

volume.reload()

snapshot_download(
model_name,
local_dir=MODELS_DIR,
ignore_patterns=[
"*.pt",
"*.bin",
"*.pth",
"original/*",
], # Ensure safetensors
revision=model_revision,
)

volume.commit()


@app.local_entrypoint()
def main(
model_name: str = DEFAULT_NAME,
model_revision: str = DEFAULT_REVISION,
):
download_model.remote(model_name, model_revision)
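Usage note (an assumption, not part of the diff): with this file saved as download_llama.py, the weights can be pulled into the shared "llamas" Volume by invoking the local entrypoint, for example

modal run download_llama.py
modal run download_llama.py --model-name meta-llama/Meta-Llama-3.1-70B-Instruct --model-revision &lt;revision&gt;

The "huggingface" Secret is expected to hold a Hugging Face token with access to the gated Llama 3.1 repository.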
@@ -106,22 +106,20 @@ def main():
 
     WORKSPACE = modal.config._profile
 
-    client.base_url = (
-        f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"
-    )
+    client.base_url = f"https://{WORKSPACE}--example-vllm-openai-compatible-serve.modal.run/v1"
 
     if args.model:
         model_id = args.model
         print(
             Colors.BOLD,
-            f"🧠: Using model {model_id}. This may trigger a boot on first call!",
+            f"🧠: Using model {model_id}. This may trigger a model load on first call!",
             Colors.END,
             sep="",
         )
     else:
         print(
             Colors.BOLD,
-            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a boot!",
+            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
             Colors.END,
             sep="",
         )
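For context, a minimal sketch of calling the renamed OpenAI-compatible endpoint directly with the openai SDK (the workspace placeholder and the bearer token below are assumptions, the token mirroring the one hard-coded in locustfile.py further down):

from openai import OpenAI

workspace = "your-workspace"  # hypothetical placeholder; the real client reads modal.config._profile

client = OpenAI(
    api_key="super-secret-token",  # assumed to match the token the vLLM server was deployed with
    base_url=f"https://{workspace}--example-vllm-openai-compatible-serve.modal.run/v1",
)

completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Give me two fun date ideas."}],
)
print(completion.choices[0].message.content)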
55 changes: 55 additions & 0 deletions 06_gpu_and_ml/llm-serving/openai_compatible/download.py
@@ -0,0 +1,55 @@
import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install(
[
"huggingface_hub", # download models from the Hugging Face Hub
"hf-transfer", # download models faster with Rust
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


MINUTES = 60
HOURS = 60 * MINUTES


app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision):
from huggingface_hub import snapshot_download

volume.reload()

snapshot_download(
model_name,
local_dir=MODELS_DIR,
ignore_patterns=[
"*.pt",
"*.bin",
"*.pth",
"original/*",
], # Ensure safetensors
revision=model_revision,
)

volume.commit()


@app.local_entrypoint()
def main(
model_name: str = DEFAULT_NAME,
model_revision: str = DEFAULT_REVISION,
):
download_model.remote(model_name, model_revision)
91 changes: 91 additions & 0 deletions 06_gpu_and_ml/llm-serving/openai_compatible/load_test.py
@@ -0,0 +1,91 @@
import os
from datetime import datetime
from pathlib import Path

import modal

if modal.is_local():
workspace = modal.config._profile
else:
workspace = os.environ["MODAL_WORKSPACE"]


image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("locust~=2.29.1", "openai~=1.37.1")
.env({"MODAL_WORKSPACE": workspace})
.copy_local_file(
Path(__file__).parent / "locustfile.py",
remote_path="/root/locustfile.py",
)
)
volume = modal.Volume.from_name(
"loadtest-vllm-oai-results", create_if_missing=True
)
remote_path = Path("/root") / "loadtests"
OUT_DIRECTORY = (
remote_path / datetime.utcnow().replace(microsecond=0).isoformat()
)

app = modal.App("loadtest-vllm-oai", image=image, volumes={remote_path: volume})

workers = 8
host = f"https://{workspace}--example-vllm-openai-compatible-serve.modal.run"
csv_file = OUT_DIRECTORY / "stats.csv"
default_args = [
"-H",
host,
"--processes",
str(workers),
"--csv",
csv_file,
]

MINUTES = 60 # seconds


@app.function(allow_concurrent_inputs=1000, cpu=workers)
@modal.web_server(port=8089)
def serve():
run_locust.local(default_args)


@app.function(cpu=workers, timeout=60 * MINUTES)
def run_locust(args: list, wait=False):
import subprocess

process = subprocess.Popen(["locust"] + args)
if wait:
process.wait()
return process.returncode


@app.local_entrypoint()
def main(
r: float = 1.0,
u: int = 36,
t: str = "1m", # no more than the timeout of run_locust, one hour
):
args = default_args + [
"--spawn-rate",
str(r),
"--users",
str(u),
"--run-time",
t,
]

html_report_file = OUT_DIRECTORY / "report.html"
args += [
"--headless", # run without browser UI
"--autostart", # start test immediately
"--autoquit", # stop once finished...
"10", # ...but wait ten seconds
"--html", # output an HTML-formatted report
html_report_file, # to this location
]

    if exit_code := run_locust.remote(args, wait=True):
        raise SystemExit(exit_code)  # propagate a nonzero Locust exit code
    else:
        print("finished successfully")
37 changes: 37 additions & 0 deletions 06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py
@@ -0,0 +1,37 @@
import logging
import random

import locust

messages = [
{
"role": "system",
"content": "You are a salesman for Modal, the cloud-native serverless Python computing platform.",
},
{
"role": "user",
"content": "Give me two fun date ideas.",
},
]


class WebsiteUser(locust.HttpUser):
wait_time = locust.between(1, 5)
headers = {
"Authorization": "Bearer super-secret-token",
"Accept": "application/json",
}

@locust.task
def chat_completion(self):
payload = {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"messages": messages,
}

response = self.client.request(
"POST", "/v1/chat/completions", json=payload, headers=self.headers
)
response.raise_for_status()
if random.random() < 0.01:
logging.info(response.json()["choices"][0]["message"]["content"])
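This locustfile is copied into the load-test image above, but it is also a plain Locust file; assuming Locust is installed locally, it could presumably be pointed at the deployed server directly with something like

locust -f locustfile.py -H https://YOUR-WORKSPACE--example-vllm-openai-compatible-serve.modal.run

The hard-coded Bearer token and model name have to match whatever the vLLM server was deployed with.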