Enable flashinfer when group_size == 6 (#2124)
vinx13 authored Apr 12, 2024
1 parent 9b71443 commit 880c68a
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion python/mlc_llm/compiler_pass/dispatch_kv_cache_creation.py
@@ -155,7 +155,7 @@ def create_flashinfer_paged_kv_cache(
                 in self.metadata["model_type"]
             )
             # filter by attention group size
-            or kwargs["num_attention_heads"] // kwargs["num_key_value_heads"] not in [1, 4, 8]
+            or kwargs["num_attention_heads"] // kwargs["num_key_value_heads"] not in [1, 4, 6, 8]
         ):
             return

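Here the attention group size is the ratio of query heads to key-value heads in grouped-query attention, and the commit adds 6 to the set of ratios for which the FlashInfer paged KV cache is dispatched. A minimal standalone sketch of the check (names and head counts are hypothetical, not taken from the MLC LLM codebase):

# Minimal sketch of the group-size filter; names and head counts are
# hypothetical, not the actual MLC LLM code.
SUPPORTED_GROUP_SIZES = [1, 4, 6, 8]  # 6 is newly enabled by this commit

def can_dispatch_flashinfer(num_attention_heads: int, num_key_value_heads: int) -> bool:
    # Group size = query heads per key-value head (grouped-query attention).
    group_size = num_attention_heads // num_key_value_heads
    return group_size in SUPPORTED_GROUP_SIZES

# Example: 48 query heads sharing 8 KV heads gives group size 6,
# which previously fell back to the default implementation.
assert can_dispatch_flashinfer(48, 8)
assert not can_dispatch_flashinfer(40, 8)  # group size 5 is still unsupported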
4 changes: 2 additions & 2 deletions python/mlc_llm/op/attention.py
@@ -103,12 +103,12 @@ def _fallback():
         and k.dtype == "float16"
         and v.dtype == "float16"
     ):
-        if group_size not in [1, 4, 8]:
+        if group_size not in [1, 4, 6, 8]:
             global WARN_FLASHINFER_GROUP_SIZE  # pylint: disable=global-statement
             if not WARN_FLASHINFER_GROUP_SIZE:
                 WARN_FLASHINFER_GROUP_SIZE = True
                 logger.warning(
-                    "FlashInfer only supports group size in [1, 4, 8], but got %d. Skip and "
+                    "FlashInfer only supports group size in [1, 4, 6, 8], but got %d. Skip and "
                     "fallback to default implementation.",
                     group_size,
                 )
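The surrounding code in attention.py logs the fallback warning at most once per process via a module-level flag. A self-contained sketch of that warn-once pattern, assuming the same flag name and message shown in the diff above (simplified; not the full MLC LLM function):

import logging

logger = logging.getLogger(__name__)
WARN_FLASHINFER_GROUP_SIZE = False  # module-level warn-once flag

def warn_unsupported_group_size(group_size: int) -> None:
    """Warn once, then stay silent, when FlashInfer cannot handle the group size."""
    global WARN_FLASHINFER_GROUP_SIZE
    if not WARN_FLASHINFER_GROUP_SIZE:
        WARN_FLASHINFER_GROUP_SIZE = True
        logger.warning(
            "FlashInfer only supports group size in [1, 4, 6, 8], but got %d. Skip and "
            "fallback to default implementation.",
            group_size,
        )

A caller would invoke this only when group_size is outside [1, 4, 6, 8], just before taking the _fallback() path.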
