From 390b495ff327e8548c3f7cd701afce87870d9102 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Fri, 26 Jan 2024 15:19:19 -0800
Subject: [PATCH] Don't build punica kernels by default (#2605)

---
 .github/workflows/scripts/build.sh | 2 ++
 Dockerfile                         | 2 ++
 setup.py                           | 2 +-
 vllm/lora/punica.py                | 9 ++++++---
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index cf3bc11823b43..2578d448436d2 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements.txt
 
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
+# Build the punica kernels for the release wheel (needed for LoRA); setup.py skips them by default
+export VLLM_INSTALL_PUNICA_KERNELS=1
 
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/Dockerfile b/Dockerfile
index 44b1dd17d7e02..4cfcf058004c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,6 +45,8 @@ ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
+# Build the punica kernels (needed for LoRA); setup.py skips them by default
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
 RUN python3 setup.py build_ext --inplace
 #################### EXTENSION Build IMAGE ####################
diff --git a/setup.py b/setup.py
index 5a3f262c1658e..88fa495205659 100644
--- a/setup.py
+++ b/setup.py
@@ -265,7 +265,7 @@ def get_torch_arch_list() -> Set[str]:
         with contextlib.suppress(ValueError):
             torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag)
 
-    install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "1")))
+    install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
     device_count = torch.cuda.device_count()
     for i in range(device_count):
         major, minor = torch.cuda.get_device_capability(i)
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index ac96931b2d071..bcb73ccc19b0e 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -157,10 +157,13 @@ def _raise_exc(
         **kwargs  # pylint: disable=unused-argument
     ):
         if torch.cuda.get_device_capability() < (8, 0):
-            raise ImportError(
-                "LoRA kernels require compute capability>=8.0") from import_exc
+            raise ImportError("punica LoRA kernels require compute "
+                              "capability>=8.0") from import_exc
         else:
-            raise import_exc
+            raise ImportError(
+                "punica LoRA kernels could not be imported. If you built vLLM "
+                "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
+                "was set.") from import_exc
 
     bgmv = _raise_exc
     add_lora = _raise_exc