[Model] Add support for DeepSeek-V2 Model (#2972)
This PR implements the DeepSeek-V2 Model architecture:
https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py.

The notable changes from the common LLM architecture include:
- Multi-head Latent Attention (MLA) (a rough sketch of the idea follows below)
- YaRN rotary positional embeddings
- DeepSeekMoE
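
As background, MLA caches a single low-rank latent vector per token instead of full per-head keys and values, and up-projects it at attention time. The NumPy sketch below only illustrates that compression idea; the dimension names and sizes are assumptions rather than DeepSeek-V2-Lite's actual configuration, and the decoupled RoPE key path is omitted.
```
import numpy as np

# Illustrative sizes only (assumptions, not the real DeepSeek-V2 config).
hidden, kv_latent, num_heads, head_dim = 2048, 512, 16, 128

rng = np.random.default_rng(0)
h = rng.standard_normal((1, hidden))                           # one token's hidden state
w_dkv = rng.standard_normal((hidden, kv_latent))               # joint KV down-projection
w_uk = rng.standard_normal((kv_latent, num_heads * head_dim))  # key up-projection
w_uv = rng.standard_normal((kv_latent, num_heads * head_dim))  # value up-projection

# Only the small latent vector is stored in the KV cache...
c_kv = h @ w_dkv                                 # (1, kv_latent)

# ...and full per-head keys/values are recovered from it when attending.
k = (c_kv @ w_uk).reshape(num_heads, head_dim)   # (num_heads, head_dim)
v = (c_kv @ w_uv).reshape(num_heads, head_dim)   # (num_heads, head_dim)
```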

Example execution on an M2 Ultra:
```
% mlc_llm chat ../models/DeepSeek-V2-Lite-Chat-MLC-q0f16 --model-lib ../models/DeepSeek-V2-Lite-Chat-MLC-q0f16/model.dylib
>>> who are you?
 I am an AI assistant created by DeepSeek to be helpful and harmless.
```
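
The same compiled artifacts can also be driven from Python through MLC's `MLCEngine` API; the paths below simply mirror the CLI invocation above and are assumptions about the local layout.
```
from mlc_llm import MLCEngine

# Hypothetical local paths, mirroring the CLI invocation above.
model = "../models/DeepSeek-V2-Lite-Chat-MLC-q0f16"
engine = MLCEngine(
    model,
    model_lib="../models/DeepSeek-V2-Lite-Chat-MLC-q0f16/model.dylib",
)

# OpenAI-style chat completion against the local engine.
response = engine.chat.completions.create(
    messages=[{"role": "user", "content": "who are you?"}],
    model=model,
    stream=False,
)
print(response.choices[0].message.content)

engine.terminate()
```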

TODO:
- Currently the model architecture only supports DeepSeek-V2-Lite.
  To support DeepSeek-V2, we also need to support the `group_limited_greedy`
  routing strategy (a rough sketch of that routing follows below).
- Support tensor parallelism > 1.
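
For context, `group_limited_greedy` in DeepSeek-V2's reference modeling code restricts routing to the few expert groups with the highest scores before taking the top-k experts. The following NumPy sketch illustrates that selection with assumed toy shapes; it is not the actual MLC or HuggingFace implementation.
```
import numpy as np

def group_limited_greedy(scores, n_group, topk_group, top_k):
    """Pick top_k experts, but only from the topk_group groups whose
    per-group maximum score is highest (illustrative sketch)."""
    n_experts = scores.shape[-1]
    # Score each group by its best expert.
    group_scores = scores.reshape(n_group, n_experts // n_group).max(axis=-1)
    keep_groups = np.argsort(group_scores)[-topk_group:]
    # Mask out every expert outside the kept groups.
    mask = np.zeros(n_experts, dtype=bool)
    mask.reshape(n_group, -1)[keep_groups] = True
    masked = np.where(mask, scores, -np.inf)
    # Top-k experts among the survivors, highest score first.
    return np.argsort(masked)[-top_k:][::-1]

# Example: 16 experts in 4 groups; route to 4 experts within the best 2 groups.
rng = np.random.default_rng(0)
print(group_limited_greedy(rng.random(16), n_group=4, topk_group=2, top_k=4))
```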
rickzx authored Oct 13, 2024
1 parent 01baf0b commit 436e189
Showing 10 changed files with 926 additions and 0 deletions.
1 change: 1 addition & 0 deletions python/mlc_llm/conversation_template/__init__.py
@@ -8,6 +8,7 @@
# model preset templates
from . import (
    cohere,
    deepseek_v2,
    dolly,
    gemma,
    glm,
21 changes: 21 additions & 0 deletions python/mlc_llm/conversation_template/deepseek_v2.py
@@ -0,0 +1,21 @@
"""Deepseek V2 default templates"""

from mlc_llm.protocol.conversation_protocol import Conversation, MessagePlaceholders

from .registry import ConvTemplateRegistry

# Deepseek V2
ConvTemplateRegistry.register_conv_template(
Conversation(
name="deepseek_v2",
system_template=f"{MessagePlaceholders.SYSTEM.value}",
system_message="",
system_prefix_token_ids=[100000],
roles={"user": "User", "assistant": "Assistant"},
seps=["\n\n", "<|end▁of▁sentence|>"],
role_content_sep=": ",
role_empty_sep=":",
stop_str=["<|end▁of▁sentence|>"],
stop_token_ids=[100001],
)
)
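
For reference, a two-turn exchange rendered with the fields above would look roughly as follows (inferred from the template definition; the BOS prefix 100000 and stop token 100001 are token ids, so they do not appear as text):
```
User: who are you?

Assistant: I am an AI assistant created by DeepSeek to be helpful and harmless.<|end▁of▁sentence|>
```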
1 change: 1 addition & 0 deletions python/mlc_llm/interface/gen_config.py
@@ -304,4 +304,5 @@ def gen_config(  # pylint: disable=too-many-locals,too-many-arguments,too-many-b
    "hermes3_llama-3_1",
    "tinyllama_v1_0",
    "aya-23",
    "deepseek_v2",
}
0 changes: 0 additions & 0 deletions python/mlc_llm/model/deepseek_v2/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions python/mlc_llm/model/deepseek_v2/deepseek_v2_loader.py
@@ -0,0 +1,130 @@
"""
This file specifies how MLC's Deepseek-V2 parameter maps from other formats, for example HuggingFace
PyTorch, HuggingFace safetensors.
"""

import functools

import numpy as np

from mlc_llm.loader import ExternMapping
from mlc_llm.quantization import Quantization

from .deepseek_v2_model import DeepseekV2Config, DeepseekV2ForCausalLM


def huggingface(model_config: DeepseekV2Config, quantization: Quantization) -> ExternMapping:
"""Returns a parameter mapping that maps from the names of MLC LLM parameters to
the names of HuggingFace PyTorch parameters.
Parameters
----------
model_config : DeepseekV2Config
The configuration of the DeepseekV2 model.
quantization : Quantization
The quantization configuration.
Returns
-------
param_map : ExternMapping
The parameter mapping from MLC to HuggingFace PyTorch.
"""
model = DeepseekV2ForCausalLM(model_config)
if quantization is not None:
model.to(quantization.model_dtype)
_, _named_params, _ = model.export_tvm( # type: ignore[misc]
spec=model.get_default_spec(),
allow_extern=True,
)
named_parameters = dict(_named_params)

mapping = ExternMapping()

for i in range(model_config.num_hidden_layers):
if i >= model_config.first_k_dense_replace and i % model_config.moe_layer_freq == 0:
# map mlp shared expert weight
mlp = f"model.layers.{i}.mlp"
shared_expert = f"{mlp}.shared_experts"
mlc_name = f"{shared_expert}.gate_up_proj.weight"
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{shared_expert}.gate_proj.weight",
f"{shared_expert}.up_proj.weight",
],
functools.partial(
lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)
# map mlp moe gate and up weight
mlc_name = f"{mlp}.moe_gate_up_proj.weight"

def combine_expert_gate_up(*hf_params, dtype):
stack = []
for i in range(0, len(hf_params), 2):
stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
return np.stack(stack, axis=0).astype(dtype)

mapping.add_mapping(
mlc_name,
functools.reduce(
lambda a, b: a + b,
[
[
f"{mlp}.experts.{expert_id}.gate_proj.weight",
f"{mlp}.experts.{expert_id}.up_proj.weight",
]
for expert_id in range(model_config.n_routed_experts)
],
),
functools.partial(
combine_expert_gate_up,
dtype=mlc_param.dtype,
),
)

# map mlp moe gate and up weight
mlc_name = f"{mlp}.moe_down_proj.weight"
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{mlp}.experts.{expert_id}.down_proj.weight"
for expert_id in range(model_config.n_routed_experts)
],
functools.partial(
lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)
else:
# map mlp weight
mlp = f"model.layers.{i}.mlp"
mlc_name = f"{mlp}.gate_up_proj.weight"
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{mlp}.gate_proj.weight",
f"{mlp}.up_proj.weight",
],
functools.partial(
lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)

for mlc_name, mlc_param in named_parameters.items():
if mlc_name not in mapping.param_map:
mapping.add_mapping(
mlc_name,
[mlc_name],
functools.partial(
lambda x, dtype: x.astype(dtype),
dtype=mlc_param.dtype,
),
)
return mapping
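
To make the expert-weight layout concrete, here is a tiny self-contained check of the same gate/up combination logic used above, with toy shapes rather than real model dimensions:
```
import numpy as np

def combine_expert_gate_up(*hf_params, dtype):
    # Same logic as in the loader: pair up (gate, up) per expert, concatenate
    # along the output dimension, then stack experts on a new leading axis.
    stack = []
    for i in range(0, len(hf_params), 2):
        stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
    return np.stack(stack, axis=0).astype(dtype)

# Two toy experts with intermediate size 3 and hidden size 4:
gate0, up0 = np.ones((3, 4)), 2 * np.ones((3, 4))
gate1, up1 = 3 * np.ones((3, 4)), 4 * np.ones((3, 4))
combined = combine_expert_gate_up(gate0, up0, gate1, up1, dtype="float16")
print(combined.shape)  # (2, 6, 4): [num_experts, 2 * intermediate, hidden]
```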