From ff4e478507b786463171eaaba3df13cbdd9d3285 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Date: Mon, 23 Sep 2024 21:59:46 +0000
Subject: [PATCH] With chunked prefill, for large prompts, the sampler can
 encounter a zero-sized tensor, on which skinny gemm fails

---
 vllm/model_executor/layers/tuned_gemm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py
index 583e55f36c29d..7ea1d8d93ea2b 100644
--- a/vllm/model_executor/layers/tuned_gemm.py
+++ b/vllm/model_executor/layers/tuned_gemm.py
@@ -62,7 +62,7 @@ def apply_skinny(self, m, n, k, inp_view, weights):
             return None
         if inp_view.dtype != torch.float16 or k % 8 != 0:
             return None
-        if m > 8 and n <= 4:
+        if m > 8 and 0 < n <= 4:
             out = torch.empty(inp_view.shape[0],
                               weights.shape[0],
                               dtype=inp_view.dtype,