From ff4e478507b786463171eaaba3df13cbdd9d3285 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Mon, 23 Sep 2024 21:59:46 +0000 Subject: [PATCH] With chunked prefill, for large prompts, the sampler can encounter a zero-sized tensor, on which skinny gemm fails --- vllm/model_executor/layers/tuned_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 583e55f36c29d..7ea1d8d93ea2b 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -62,7 +62,7 @@ def apply_skinny(self, m, n, k, inp_view, weights): return None if inp_view.dtype != torch.float16 or k % 8 != 0: return None - if m > 8 and n <= 4: + if m > 8 and 0 < n <= 4: out = torch.empty(inp_view.shape[0], weights.shape[0], dtype=inp_view.dtype,