From 390b495ff327e8548c3f7cd701afce87870d9102 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Jan 2024 15:19:19 -0800 Subject: [PATCH] Don't build punica kernels by default (#2605) --- .github/workflows/scripts/build.sh | 2 ++ Dockerfile | 2 ++ setup.py | 2 +- vllm/lora/punica.py | 9 ++++++--- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index cf3bc11823b43..2578d448436d2 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 +# Make sure punica is built for the release (for LoRA) +export VLLM_INSTALL_PUNICA_KERNELS=1 # Build $python_executable setup.py bdist_wheel --dist-dir=dist diff --git a/Dockerfile b/Dockerfile index 44b1dd17d7e02..4cfcf058004c5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,6 +45,8 @@ ENV MAX_JOBS=${max_jobs} # number of threads used by nvcc ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads +# make sure punica kernels are built (for LoRA) +ENV VLLM_INSTALL_PUNICA_KERNELS=1 RUN python3 setup.py build_ext --inplace #################### EXTENSION Build IMAGE #################### diff --git a/setup.py b/setup.py index 5a3f262c1658e..88fa495205659 100644 --- a/setup.py +++ b/setup.py @@ -265,7 +265,7 @@ def get_torch_arch_list() -> Set[str]: with contextlib.suppress(ValueError): torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag) - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "1"))) + install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) device_count = torch.cuda.device_count() for i in range(device_count): major, minor = torch.cuda.get_device_capability(i) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ac96931b2d071..bcb73ccc19b0e 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -157,10 +157,13 @@ def _raise_exc( **kwargs # pylint: disable=unused-argument ): if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "LoRA kernels require compute capability>=8.0") from import_exc + raise ImportError("punica LoRA kernels require compute " + "capability>=8.0") from import_exc else: - raise import_exc + raise ImportError( + "punica LoRA kernels could not be imported. If you built vLLM " + "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " + "was set.") from import_exc bgmv = _raise_exc add_lora = _raise_exc