feat: examples for lmdeploy and tgi oai server (#772)
* feat: lmdeploy and tgi oai server setup
* moves to misc folder

Co-authored-by: Charles Frye <[email protected]>
1 parent 18d5b45 · commit 310b213
Showing 2 changed files with 171 additions and 0 deletions.
New file: LMDeploy OpenAI-compatible server on Modal

@@ -0,0 +1,86 @@
import subprocess

import modal
from modal import App, Image, Secret, gpu

########## CONSTANTS ##########


# define model for serving and path to store in modal container
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
MODEL_DIR = f"/models/{MODEL_NAME}"
SERVE_MODEL_NAME = "meta--llama-2-7b"
HF_SECRET = Secret.from_name("huggingface-secret")
SECONDS = 60  # for timeouts; multiplying by SECONDS converts minutes to seconds


########## UTILS FUNCTIONS ##########


def download_hf_model(model_dir: str, model_name: str):
    """Retrieve a model from the Hugging Face Hub and save it into
    the specified path within the Modal container.

    Args:
        model_dir (str): Path to save model weights in the container.
        model_name (str): Hugging Face model ID.
    """
    import os

    from huggingface_hub import snapshot_download  # type: ignore
    from transformers.utils import move_cache  # type: ignore

    os.makedirs(model_dir, exist_ok=True)

    snapshot_download(
        model_name,
        local_dir=model_dir,
        # skipping consolidated.safetensors prevents the error described in
        # https://github.com/vllm-project/vllm/pull/5005
        ignore_patterns=["*.pt", "*.bin", "consolidated.safetensors"],
        token=os.environ["HF_TOKEN"],
    )
    move_cache()


########## IMAGE DEFINITION ##########

# define the image for the Modal environment
lmdeploy_image = (
    Image.from_registry(
        "openmmlab/lmdeploy:v0.4.2",
    )
    .pip_install(["lmdeploy[all]", "huggingface_hub", "hf-transfer"])
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_hf_model,
        timeout=60 * SECONDS,
        kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME},
        secrets=[HF_SECRET],
    )
)

########## APP SETUP ##########


app = App(f"lmdeploy-{SERVE_MODEL_NAME}")

NO_GPU = 1  # number of GPUs per container
TOKEN = "secret12345"


@app.function(
    image=lmdeploy_image,
    gpu=gpu.A10G(count=NO_GPU),
    container_idle_timeout=20 * SECONDS,
    # https://modal.com/docs/guide/concurrent-inputs
    allow_concurrent_inputs=256,  # max concurrent inputs per container
)
@modal.web_server(port=23333, startup_timeout=60 * SECONDS)
def serve():
    cmd = f"""
    lmdeploy serve api_server {MODEL_DIR} \
        --model-name {SERVE_MODEL_NAME} \
        --server-port 23333 \
        --session-len 4092
    """
    subprocess.Popen(cmd, shell=True)
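
Once this app is deployed (e.g. with `modal deploy`), the container serves an OpenAI-compatible API on port 23333 at the URL Modal prints. Below is a minimal client sketch using the `openai` Python package; the base URL is a hypothetical placeholder, and since Llama-2-7b-hf is a base (non-chat) model the plain completions route is used:

from openai import OpenAI

# hypothetical URL; Modal prints the real endpoint at deploy time
BASE_URL = "https://my-workspace--lmdeploy-meta--llama-2-7b-serve.modal.run/v1"

# the example starts the server without API-key enforcement, so the key is unused
client = OpenAI(base_url=BASE_URL, api_key="not-used")

completion = client.completions.create(
    model="meta--llama-2-7b",  # SERVE_MODEL_NAME above
    prompt="The capital of France is",
    max_tokens=32,
)
print(completion.choices[0].text)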
New file: Text Generation Inference (TGI) server on Modal

@@ -0,0 +1,85 @@
import subprocess

import modal
from modal import App, Image, Secret, gpu

# define model for serving and path to store in modal container
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
MODEL_DIR = f"/models/{MODEL_NAME}"
SERVE_MODEL_NAME = "meta--llama-2-7b"
HF_SECRET = Secret.from_name("huggingface-secret")
SECONDS = 60  # for timeouts; multiplying by SECONDS converts minutes to seconds

########## UTILS FUNCTIONS ##########


def download_hf_model(model_dir: str, model_name: str):
    """Retrieve a model from the Hugging Face Hub and save it into
    the specified path within the Modal container.

    Args:
        model_dir (str): Path to save model weights in the container.
        model_name (str): Hugging Face model ID.
    """
    import os

    from huggingface_hub import snapshot_download  # type: ignore
    from transformers.utils import move_cache  # type: ignore

    os.makedirs(model_dir, exist_ok=True)

    snapshot_download(
        model_name,
        local_dir=model_dir,
        # skipping consolidated.safetensors prevents the error described in
        # https://github.com/vllm-project/vllm/pull/5005
        ignore_patterns=["*.pt", "*.bin", "consolidated.safetensors"],
        token=os.environ["HF_TOKEN"],
    )
    move_cache()


########## IMAGE DEFINITION ##########


# define the image for the Modal environment
tgi_image = (
    Image.from_registry(
        "ghcr.io/huggingface/text-generation-inference", add_python="3.10"
    )
    .dockerfile_commands("ENTRYPOINT []")  # clear the image entrypoint so Modal can run its own commands
    .pip_install(["huggingface_hub", "hf-transfer"])
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_hf_model,
        timeout=20 * SECONDS,
        kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME},
        secrets=[HF_SECRET],
    )
)


########## APP SETUP ##########


app = App(f"tgi-{SERVE_MODEL_NAME}")


NO_GPU = 1  # number of GPUs per container
TOKEN = "secret12345"


@app.function(
    image=tgi_image,
    gpu=gpu.A10G(count=NO_GPU),
    container_idle_timeout=20 * SECONDS,
    # https://modal.com/docs/guide/concurrent-inputs
    allow_concurrent_inputs=256,  # max concurrent inputs per container
)
@modal.web_server(port=3000, startup_timeout=60 * SECONDS)
def serve():
    cmd = f"""
    text-generation-launcher --model-id {MODEL_DIR} \
        --hostname 0.0.0.0 \
        --port 3000
    """
    subprocess.Popen(cmd, shell=True)
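
After deployment, TGI listens on port 3000 and exposes its native `/generate` route (recent TGI versions also serve an OpenAI-compatible `/v1/chat/completions` route). A minimal sketch with `requests`, again against a hypothetical deployment URL:

import requests

# hypothetical URL; Modal prints the real endpoint at deploy time
BASE_URL = "https://my-workspace--tgi-meta--llama-2-7b-serve.modal.run"

resp = requests.post(
    f"{BASE_URL}/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 32},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])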