[Model] Add support for DeepSeek-V2 Model (#2972)
This PR implements the DeepSeek-V2 Model architecture:
https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py.

The notable changes from the common LLM architecture include:
- Multi-head Latent Attention (MLA) (a rough sketch of the idea follows below)
- YaRN rotary positional embeddings
- DeepSeekMoE
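
As background, MLA caches a single low-rank latent vector per token instead of full per-head keys and values, and up-projects it at attention time. The NumPy sketch below only illustrates that compression idea; the dimension names and sizes are assumptions rather than DeepSeek-V2-Lite's actual configuration, and the decoupled RoPE key path is omitted.
```
import numpy as np

# Illustrative sizes only (assumptions, not the real DeepSeek-V2 config).
hidden, kv_latent, num_heads, head_dim = 2048, 512, 16, 128

rng = np.random.default_rng(0)
h = rng.standard_normal((1, hidden))                           # one token's hidden state
w_dkv = rng.standard_normal((hidden, kv_latent))               # joint KV down-projection
w_uk = rng.standard_normal((kv_latent, num_heads * head_dim))  # key up-projection
w_uv = rng.standard_normal((kv_latent, num_heads * head_dim))  # value up-projection

# Only the small latent vector is stored in the KV cache...
c_kv = h @ w_dkv                                 # (1, kv_latent)

# ...and full per-head keys/values are recovered from it when attending.
k = (c_kv @ w_uk).reshape(num_heads, head_dim)   # (num_heads, head_dim)
v = (c_kv @ w_uv).reshape(num_heads, head_dim)   # (num_heads, head_dim)
```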

Example execution on an M2 Ultra:
```
% mlc_llm chat ../models/DeepSeek-V2-Lite-Chat-MLC-q0f16 --model-lib ../models/DeepSeek-V2-Lite-Chat-MLC-q0f16/model.dylib
>>> who are you?
 I am an AI assistant created by DeepSeek to be helpful and harmless.
```
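
The same compiled artifacts can also be driven from Python through MLC's `MLCEngine` API; the paths below simply mirror the CLI invocation above and are assumptions about the local layout.
```
from mlc_llm import MLCEngine

# Hypothetical local paths, mirroring the CLI invocation above.
model = "../models/DeepSeek-V2-Lite-Chat-MLC-q0f16"
engine = MLCEngine(
    model,
    model_lib="../models/DeepSeek-V2-Lite-Chat-MLC-q0f16/model.dylib",
)

# OpenAI-style chat completion against the local engine.
response = engine.chat.completions.create(
    messages=[{"role": "user", "content": "who are you?"}],
    model=model,
    stream=False,
)
print(response.choices[0].message.content)

engine.terminate()
```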

TODO:
- Currently the model architecture only supports DeepSeek-V2-Lite.
  To support DeepSeek-V2, we also need to support the `group_limited_greedy`
  routing strategy (a rough sketch of that routing follows below).
- Support tensor parallelism > 1.
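
For context, `group_limited_greedy` in DeepSeek-V2's reference modeling code restricts routing to the few expert groups with the highest scores before taking the top-k experts. The following NumPy sketch illustrates that selection with assumed toy shapes; it is not the actual MLC or HuggingFace implementation.
```
import numpy as np

def group_limited_greedy(scores, n_group, topk_group, top_k):
    """Pick top_k experts, but only from the topk_group groups whose
    per-group maximum score is highest (illustrative sketch)."""
    n_experts = scores.shape[-1]
    # Score each group by its best expert.
    group_scores = scores.reshape(n_group, n_experts // n_group).max(axis=-1)
    keep_groups = np.argsort(group_scores)[-topk_group:]
    # Mask out every expert outside the kept groups.
    mask = np.zeros(n_experts, dtype=bool)
    mask.reshape(n_group, -1)[keep_groups] = True
    masked = np.where(mask, scores, -np.inf)
    # Top-k experts among the survivors, highest score first.
    return np.argsort(masked)[-top_k:][::-1]

# Example: 16 experts in 4 groups; route to 4 experts within the best 2 groups.
rng = np.random.default_rng(0)
print(group_limited_greedy(rng.random(16), n_group=4, topk_group=2, top_k=4))
```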
rickzx authored Oct 13, 2024
1 parent 01baf0b commit 436e189
Showing 10 changed files with 926 additions and 0 deletions.
1 change: 1 addition & 0 deletions python/mlc_llm/conversation_template/__init__.py
@@ -8,6 +8,7 @@
# model preset templates
from . import (
    cohere,
    deepseek_v2,
    dolly,
    gemma,
    glm,
21 changes: 21 additions & 0 deletions python/mlc_llm/conversation_template/deepseek_v2.py
@@ -0,0 +1,21 @@
"""Deepseek V2 default templates"""

from mlc_llm.protocol.conversation_protocol import Conversation, MessagePlaceholders

from .registry import ConvTemplateRegistry

# Deepseek V2
ConvTemplateRegistry.register_conv_template(
Conversation(
name="deepseek_v2",
system_template=f"{MessagePlaceholders.SYSTEM.value}",
system_message="",
system_prefix_token_ids=[100000],
roles={"user": "User", "assistant": "Assistant"},
seps=["\n\n", "<|end▁of▁sentence|>"],
role_content_sep=": ",
role_empty_sep=":",
stop_str=["<|end▁of▁sentence|>"],
stop_token_ids=[100001],
)
)
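
For reference, a two-turn exchange rendered with the fields above would look roughly as follows (inferred from the template definition; the BOS prefix 100000 and stop token 100001 are token ids, so they do not appear as text):
```
User: who are you?

Assistant: I am an AI assistant created by DeepSeek to be helpful and harmless.<|end▁of▁sentence|>
```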
1 change: 1 addition & 0 deletions python/mlc_llm/interface/gen_config.py
@@ -304,4 +304,5 @@ def gen_config(  # pylint: disable=too-many-locals,too-many-arguments,too-many-b
    "hermes3_llama-3_1",
    "tinyllama_v1_0",
    "aya-23",
    "deepseek_v2",
}
0 changes: 0 additions & 0 deletions python/mlc_llm/model/deepseek_v2/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions python/mlc_llm/model/deepseek_v2/deepseek_v2_loader.py
@@ -0,0 +1,130 @@
"""
This file specifies how MLC's Deepseek-V2 parameter maps from other formats, for example HuggingFace
PyTorch, HuggingFace safetensors.
"""

import functools

import numpy as np

from mlc_llm.loader import ExternMapping
from mlc_llm.quantization import Quantization

from .deepseek_v2_model import DeepseekV2Config, DeepseekV2ForCausalLM


def huggingface(model_config: DeepseekV2Config, quantization: Quantization) -> ExternMapping:
"""Returns a parameter mapping that maps from the names of MLC LLM parameters to
the names of HuggingFace PyTorch parameters.
Parameters
----------
model_config : DeepseekV2Config
The configuration of the DeepseekV2 model.
quantization : Quantization
The quantization configuration.
Returns
-------
param_map : ExternMapping
The parameter mapping from MLC to HuggingFace PyTorch.
"""
model = DeepseekV2ForCausalLM(model_config)
if quantization is not None:
model.to(quantization.model_dtype)
_, _named_params, _ = model.export_tvm( # type: ignore[misc]
spec=model.get_default_spec(),
allow_extern=True,
)
named_parameters = dict(_named_params)

mapping = ExternMapping()

for i in range(model_config.num_hidden_layers):
if i >= model_config.first_k_dense_replace and i % model_config.moe_layer_freq == 0:
# map mlp shared expert weight
mlp = f"model.layers.{i}.mlp"
shared_expert = f"{mlp}.shared_experts"
mlc_name = f"{shared_expert}.gate_up_proj.weight"
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{shared_expert}.gate_proj.weight",
f"{shared_expert}.up_proj.weight",
],
functools.partial(
lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)
# map mlp moe gate and up weight
mlc_name = f"{mlp}.moe_gate_up_proj.weight"

def combine_expert_gate_up(*hf_params, dtype):
stack = []
for i in range(0, len(hf_params), 2):
stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
return np.stack(stack, axis=0).astype(dtype)

mapping.add_mapping(
mlc_name,
functools.reduce(
lambda a, b: a + b,
[
[
f"{mlp}.experts.{expert_id}.gate_proj.weight",
f"{mlp}.experts.{expert_id}.up_proj.weight",
]
for expert_id in range(model_config.n_routed_experts)
],
),
functools.partial(
combine_expert_gate_up,
dtype=mlc_param.dtype,
),
)

# map mlp moe gate and up weight
mlc_name = f"{mlp}.moe_down_proj.weight"
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{mlp}.experts.{expert_id}.down_proj.weight"
for expert_id in range(model_config.n_routed_experts)
],
functools.partial(
lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)
else:
# map mlp weight
mlp = f"model.layers.{i}.mlp"
mlc_name = f"{mlp}.gate_up_proj.weight"
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{mlp}.gate_proj.weight",
f"{mlp}.up_proj.weight",
],
functools.partial(
lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)

for mlc_name, mlc_param in named_parameters.items():
if mlc_name not in mapping.param_map:
mapping.add_mapping(
mlc_name,
[mlc_name],
functools.partial(
lambda x, dtype: x.astype(dtype),
dtype=mlc_param.dtype,
),
)
return mapping
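
To make the expert-weight layout concrete, here is a tiny self-contained check of the same gate/up combination logic used above, with toy shapes rather than real model dimensions:
```
import numpy as np

def combine_expert_gate_up(*hf_params, dtype):
    # Same logic as in the loader: pair up (gate, up) per expert, concatenate
    # along the output dimension, then stack experts on a new leading axis.
    stack = []
    for i in range(0, len(hf_params), 2):
        stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
    return np.stack(stack, axis=0).astype(dtype)

# Two toy experts with intermediate size 3 and hidden size 4:
gate0, up0 = np.ones((3, 4)), 2 * np.ones((3, 4))
gate1, up1 = 3 * np.ones((3, 4)), 4 * np.ones((3, 4))
combined = combine_expert_gate_up(gate0, up0, gate1, up1, dtype="float16")
print(combined.shape)  # (2, 6, 4): [num_experts, 2 * intermediate, hidden]
```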