[V1] LoRA - Enable Serving Usecase (vllm-project#12883)
Signed-off-by: Varun Sundar Rabindranath <[email protected]>
Co-authored-by: Varun Sundar Rabindranath <[email protected]>
varun-sundar-rabindranath and Varun Sundar Rabindranath authored Feb 14, 2025
1 parent f0b2da7 commit cbc4012
Showing 7 changed files with 210 additions and 7 deletions.
165 changes: 165 additions & 0 deletions tests/lora/test_add_lora.py
@@ -0,0 +1,165 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import time
from pathlib import Path
from typing import List

import pytest
from huggingface_hub import snapshot_download

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators

MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_DOWNLOAD_PATH = None # Populated by download_and_prepare_lora_module() #noqa
LORA_RANK = 8
DEFAULT_MAX_LORAS = 16 * 3


def download_and_prepare_lora_module():
"""
Request submission is expensive when the LoRA adapters have their own
tokenizers. This is because, for each request with a new LoRA adapter ID,
the front-end loads the tokenizer from disk.
In this test, since we are comparing request processing times, we want to
minimize any extra activity. To that end, we download the LoRA
adapter and remove all of its tokenizer files, so the engine falls back
to the base-model tokenizer.
"""
global LORA_MODULE_DOWNLOAD_PATH

LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)

tokenizer_files = [
'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
'tokenizer.model'
]
for tokenizer_file in tokenizer_files:
del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
del_path.unlink()


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def get_lora_requests() -> List[LoRARequest]:
lora_requests: List[LoRARequest] = [
LoRARequest(lora_name=f"{i}",
lora_int_id=i,
lora_path=LORA_MODULE_DOWNLOAD_PATH)
for i in range(1, DEFAULT_MAX_LORAS + 1)
]
return lora_requests


async def requests_processing_time(llm,
lora_requests: List[LoRARequest]) -> float:

sampling_params = SamplingParams(n=1,
temperature=0.0,
top_p=1.0,
ignore_eos=True,
max_tokens=1)

generators = []
start = time.perf_counter()

for lora_request in lora_requests:
lora_int_id = lora_request.lora_int_id
generator = llm.generate(
prompt=TextPrompt(prompt=f"hello {lora_int_id}",
multi_modal_data=None), # type: ignore
sampling_params=sampling_params,
lora_request=lora_request,
request_id=f"test{lora_int_id}")
generators.append(generator)

all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass

end = time.perf_counter()
return end - start


@pytest.mark.asyncio
async def test_add_lora():
"""
The add_lora function is used to pre-load LoRA adapters into the
engine in anticipation of future requests that use those adapters. To test
this functionality, we use the async engine to process a set of requests
twice: once with add_lora() pre-loading and once without.
We measure the request processing time in both cases and expect it to
be lower when the adapters have been pre-loaded with add_lora().
"""

download_and_prepare_lora_module()

lora_requests: List[LoRARequest] = get_lora_requests()

max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
# Create engine in eager-mode. Due to high max_loras, the CI can
# OOM during cuda-graph capture.
engine_args = AsyncEngineArgs(
model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8, #avoid OOM
enforce_eager=True)

# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
# environment variable. Reload vllm.engine.async_llm_engine, as
# vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on the
# env var.
import importlib

import vllm.engine.async_llm_engine
importlib.reload(vllm.engine.async_llm_engine)
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)

# split lora_requests into 3 parts
part_size = len(lora_requests) // 3
dummy_run_requests = lora_requests[:part_size]
warmup_run_requests = lora_requests[part_size:part_size * 2]
cold_run_requests = lora_requests[part_size * 2:]

async with build_async_engine_client_from_engine_args(engine_args) as llm:

# Dummy run - so that any one-time work, such as Triton kernel
# compilation, is complete before we start timing.
await requests_processing_time(llm, dummy_run_requests)

# Run with warmup
for lr in warmup_run_requests:
await llm.add_lora(lr)
# Wait for the add_lora function to complete on the server side.
await asyncio.sleep(30)
time_with_add_lora = await requests_processing_time(
llm, warmup_run_requests)

# Run without any warmup
time_cold_start = await requests_processing_time(
llm, cold_run_requests)

print(f"time hot-start {time_with_add_lora} vs "
f"time cold-start {time_cold_start} ")

assert time_with_add_lora < time_cold_start, (
f"time_with_add_lora={time_with_add_lora}, "
f"time_cold_start={time_cold_start}"
"The engine request processing time with LoRA pre-loading "
"must be less than the version that does on-demand LoRA loading.")
1 change: 1 addition & 0 deletions vllm/v1/engine/__init__.py
@@ -134,3 +134,4 @@ class EngineCoreRequestType(enum.Enum):
ABORT = b'\x01'
PROFILE = b'\x02'
RESET_PREFIX_CACHE = b'\x03'
ADD_LORA = b'\x04'
8 changes: 4 additions & 4 deletions vllm/v1/engine/async_llm.py
@@ -361,6 +361,10 @@ async def stop_profile(self) -> None:
async def reset_prefix_cache(self) -> None:
await self.engine_core.reset_prefix_cache_async()

async def add_lora(self, lora_request: LoRARequest) -> None:
"""Load a new LoRA adapter into the engine for future requests."""
await self.engine_core.add_lora_async(lora_request)

@property
def is_running(self) -> bool:
return True
@@ -376,7 +380,3 @@ def errored(self) -> bool:
@property
def dead_error(self) -> BaseException:
return Exception() # TODO: implement

async def add_lora(self, lora_request: LoRARequest) -> None:
"""Load a new LoRA adapter into the engine for future requests."""
raise NotImplementedError("LoRA not yet supported in V1")
18 changes: 15 additions & 3 deletions vllm/v1/engine/core.py
@@ -13,6 +13,7 @@

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.utils import get_exception_traceback, zmq_socket_ctx
@@ -146,6 +147,9 @@ def profile(self, is_start: bool = True):
def reset_prefix_cache(self):
self.scheduler.reset_prefix_cache()

def add_lora(self, lora_request: LoRARequest) -> None:
self.model_executor.add_lora(lora_request)


class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""
@@ -262,12 +266,15 @@ def _handle_client_request(self, request_type: EngineCoreRequestType,
self.reset_prefix_cache()
elif request_type == EngineCoreRequestType.PROFILE:
self.model_executor.profile(request)
elif request_type == EngineCoreRequestType.ADD_LORA:
self.model_executor.add_lora(request)

def process_input_socket(self, input_path: str):
"""Input socket IO thread."""

# Msgpack serialization decoding.
add_request_decoder = MsgpackDecoder(EngineCoreRequest)
add_lora_decoder = MsgpackDecoder(LoRARequest)
generic_decoder = MsgpackDecoder()

with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
@@ -277,9 +284,14 @@ def process_input_socket(self, input_path: str):
request_type = EngineCoreRequestType(bytes(type_frame.buffer))

# Deserialize the request data.
decoder = add_request_decoder if (
request_type
== EngineCoreRequestType.ADD) else generic_decoder
decoder = None
if request_type == EngineCoreRequestType.ADD:
decoder = add_request_decoder
elif request_type == EngineCoreRequestType.ADD_LORA:
decoder = add_lora_decoder
else:
decoder = generic_decoder

request = decoder.decode(data_frame.buffer)

# Push to input queue for core busy loop.
16 changes: 16 additions & 0 deletions vllm/v1/engine/core_client.py
@@ -12,6 +12,7 @@

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
make_zmq_socket)
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
@@ -77,6 +78,9 @@ def reset_prefix_cache(self) -> None:
def abort_requests(self, request_ids: List[str]) -> None:
raise NotImplementedError

def add_lora(self, lora_request: LoRARequest) -> None:
raise NotImplementedError

async def get_output_async(self) -> EngineCoreOutputs:
raise NotImplementedError

@@ -92,6 +96,9 @@ async def reset_prefix_cache_async(self) -> None:
async def abort_requests_async(self, request_ids: List[str]) -> None:
raise NotImplementedError

async def add_lora_async(self, lora_request: LoRARequest) -> None:
raise NotImplementedError


class InprocClient(EngineCoreClient):
"""
@@ -125,6 +132,9 @@ def profile(self, is_start: bool = True) -> None:
def reset_prefix_cache(self) -> None:
self.engine_core.reset_prefix_cache()

def add_lora(self, lora_request: LoRARequest) -> None:
self.engine_core.add_lora(lora_request)


class MPClient(EngineCoreClient):
"""
@@ -242,6 +252,9 @@ def profile(self, is_start: bool = True) -> None:
def reset_prefix_cache(self) -> None:
self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, None)

def add_lora(self, lora_request: LoRARequest) -> None:
self._send_input(EngineCoreRequestType.ADD_LORA, lora_request)


class AsyncMPClient(MPClient):
"""Asyncio-compatible client for multi-proc EngineCore."""
@@ -295,3 +308,6 @@ async def profile_async(self, is_start: bool = True) -> None:

async def reset_prefix_cache_async(self) -> None:
await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, None)

async def add_lora_async(self, lora_request: LoRARequest) -> None:
await self._send_input(EngineCoreRequestType.ADD_LORA, lora_request)
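
Taken together with the async_llm.py and core.py hunks, the client-side call path for pre-loading is AsyncLLM.add_lora -> EngineCoreClient.add_lora_async -> _send_input(EngineCoreRequestType.ADD_LORA, lora_request); on the server side, EngineCoreProc decodes the LoRARequest and hands it to model_executor.add_lora. Below is a small sketch of driving that path from client code; the `client` object and adapter paths are assumed, and issuing the calls concurrently is an assumption rather than something this diff guarantees.

import asyncio
from typing import List

from vllm.lora.request import LoRARequest


async def preload_adapters(client, adapter_paths: List[str]) -> None:
    """Send one ADD_LORA message per adapter before any generate() call."""
    lora_requests = [
        LoRARequest(lora_name=f"adapter-{i}",
                    lora_int_id=i,
                    lora_path=path)
        for i, path in enumerate(adapter_paths, start=1)
    ]
    # Each call serializes the LoRARequest with msgpack and pushes it to the
    # engine-core input socket tagged with the ADD_LORA request type.
    await asyncio.gather(
        *(client.add_lora_async(lr) for lr in lora_requests))
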
4 changes: 4 additions & 0 deletions vllm/v1/worker/gpu_worker.py
@@ -15,6 +15,7 @@
init_distributed_environment,
set_custom_all_reduce)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor import set_random_seed
from vllm.platforms import current_platform
from vllm.utils import GiB_bytes
@@ -234,6 +235,9 @@ def profile(self, is_start: bool = True):
else:
self.profiler.stop()

def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_runner.add_lora(lora_request)

def check_health(self) -> None:
# worker will always be healthy as long as it's running.
return
5 changes: 5 additions & 0 deletions vllm/v1/worker/lora_model_runner_mixin.py
@@ -127,3 +127,8 @@ def maybe_profile_with_lora(self, lora_config: LoRAConfig,

# __exit__ code
self.lora_manager.remove_all_adapters()

def add_lora(self, lora_request: LoRARequest) -> bool:
if not self.lora_manager:
raise RuntimeError("LoRA is not enabled.")
return self.lora_manager.add_adapter(lora_request)
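
Note that the runner-side add_lora only works when the engine was created with LoRA enabled; otherwise self.lora_manager is unset and the call raises RuntimeError. A minimal sketch of guarding for that at a call site, assuming a `worker` object that exposes the add_lora method added in gpu_worker.py above:

from vllm.lora.request import LoRARequest

lora_request = LoRARequest(lora_name="sql-adapter",
                           lora_int_id=1,
                           lora_path="/path/to/adapter")  # placeholder
try:
    # The worker forwards to model_runner.add_lora, which returns a bool.
    loaded = worker.add_lora(lora_request)
except RuntimeError:
    # Raised when the engine was started without enable_lora=True.
    loaded = False
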
