Skip to content

Commit

Permalink
MoE support for turbomind (#2621)
Browse files Browse the repository at this point in the history
* initial moe support

* dynamic grouped gemm

* benchmark

* moe benchmark

* moe sampling

* split-k

* refactor tuning

* simplify

* n-major weight

* add `num` for `MatrixLayout`

* packed rows

* packed cols

* dispatch for packed rows

* w4a16 moe

* refactor model loading

* fix pytorch loader

* refactor

* dispatch w4a16 moe

* fix loader

* add comment

* fix msvc build

* fix msvc build

* fix msvc build

* fix ut

* fix ut

* fix p-lora

* add all supported arches

* minor

* fix lint

* fix lint

* fix lint

* fix ut

* bf16 support

* minor

* refactor

* fix lint

* fix ut

* minor

* minor

* minor

* fix inter_size config

* load with non-standard filenames

* fix loader

* fix missing default param

* defer the loading of misc weights for safetensors

* fix conversion

* fix deepseek-vl

* verify model config

* pad inter size by group size and tp

* fix minicpm attn bias & ignore un-needed bias

* set `attn_bias` based on minicpm version
  • Loading branch information
lzhangzz authored Oct 25, 2024
1 parent 1a76efb commit 962e760
Show file tree
Hide file tree
Showing 105 changed files with 5,703 additions and 1,772 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"]
args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/*,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"]


- repo: https://github.com/myint/docformatter
Expand Down
12 changes: 11 additions & 1 deletion lmdeploy/turbomind/deploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class ModelConfig:
num_layer: int = None
inter_size: int = None
norm_eps: float = None
attn_bias: int = None
attn_bias: int = 0
start_id: int = None
end_id: int = None
size_per_head: int = 128
Expand All @@ -47,6 +47,16 @@ class ModelConfig:
session_len: int = None
tp: int = 1
model_format: str = 'hf'
expert_num: int = 0
expert_inter_size: int = 0
experts_per_token: int = 0

def verify(self):
    """Ensure the model config is fully populated.

    Scans every attribute on the instance and raises ``AssertionError``
    if any of them is still ``None`` (i.e. was never filled in during
    conversion), reporting the offending fields in the message.
    """
    unset = {name: value
             for name, value in self.__dict__.items()
             if value is None}
    assert not unset, f'incomplete model config: {unset}'


@dataclass
Expand Down
14 changes: 4 additions & 10 deletions lmdeploy/turbomind/deploy/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ...utils import _get_and_verify_max_len, is_bf16_supported
from ..supported_models import SUPPORTED_ARCHS, is_supported
from .config import TurbomindModelConfig
from .exporter import get_exporter_factory
from .module import Transformer
from .policy import get_input_policy
from .source_model.base import INPUT_MODELS
from .target_model.base import OUTPUT_MODELS
Expand Down Expand Up @@ -99,7 +99,6 @@ def get_output_model_registered_name_and_config(model_path: str,
group_size (int): the size of group used by awq model
"""
register_name = 'tm'
turbomind_model_arch = 'llama'
weight_type = 'float16'

config = TurbomindModelConfig.from_dict()
Expand All @@ -108,7 +107,6 @@ def get_output_model_registered_name_and_config(model_path: str,
session_len = 2048
else: # hf, awq, None
model_arch, model_config = get_model_arch(model_path)
turbomind_model_arch = SUPPORTED_ARCHS[model_arch]
session_len = _get_and_verify_max_len(model_config, None)
if model_format in ['awq', 'gptq']:
weight_type = 'int4'
Expand Down Expand Up @@ -148,11 +146,7 @@ def get_output_model_registered_name_and_config(model_path: str,
config.model_config.group_size = group_size
config.model_config.session_len = session_len

lora_type = 'plora' if turbomind_model_arch == 'xcomposer2' else ''

exporter_factory = get_exporter_factory(weight_type, lora_type)

return register_name, config, exporter_factory
return register_name, config


def pack_model_repository(workspace_path: str):
Expand Down Expand Up @@ -264,7 +258,7 @@ def get_tm_model(model_path,
tokenizer_path=model_path,
input_policy=input_policy)

output_model_name, tm_cfg, exporter_factory = \
output_model_name, tm_cfg = \
get_output_model_registered_name_and_config(
model_path=model_path,
model_format=engine_config.model_format,
Expand All @@ -278,7 +272,7 @@ def get_tm_model(model_path,
output_model = OUTPUT_MODELS.get(output_model_name)(
input_model=input_model,
cfg=tm_cfg,
exporter_factory=exporter_factory,
model_cls=Transformer,
out_dir=out_dir)

return output_model
Expand Down
211 changes: 0 additions & 211 deletions lmdeploy/turbomind/deploy/exporter.py

This file was deleted.

Loading

0 comments on commit 962e760

Please sign in to comment.