InternLM · lvhan028 · Nov 13, 2024 · Nov 7, 2024 · Nov 7, 2024 · Nov 7, 2024
diff --git a/autotest/config.yaml b/autotest/config.yaml
@@ -44,10 +44,12 @@ turbomind_chat_model:
     - Qwen/Qwen2-1.5B-Instruct
     - Qwen/Qwen1.5-7B-Chat
     - Qwen/Qwen1.5-4B-Chat-AWQ
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
     - Qwen/Qwen-VL-Chat
     - Qwen/Qwen2.5-0.5B-Instruct
     - Qwen/Qwen2.5-7B-Instruct
     - Qwen/Qwen2-7B-Instruct-GPTQ-Int4
+    - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4
     - mistralai/Mistral-7B-Instruct-v0.3
     - mistralai/Mixtral-8x7B-Instruct-v0.1
     - lmdeploy/llama2-chat-7b-w4

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
@@ -50,6 +50,8 @@ class ModelConfig:
     expert_num: int = 0
     expert_inter_size: int = 0
     experts_per_token: int = 0
+    moe_shared_gate: int = False
+    moe_norm_topk: int = False
 
     def verify(self):
         invalid = {}

diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py
@@ -140,14 +140,18 @@ class MoeFfn(Ffn):
     requires:
         r.moe_ffn_expert(e, i, kind)
         r.moe_ffn_gate(i)
+        r.moe_ffn_shared_gate(i)
     """
 
     _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
-    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
+    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight'
+    _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'
 
     def __init__(self, model: BaseOutputModel):
         super().__init__(model)
         self.expert_num = model.model_config.expert_num
+        self.inter_size = model.model_config.expert_inter_size
+        self.shared_gate = model.model_config.moe_shared_gate
 
     def apply(self, i: int, r: BaseReader):
         for p in get_params(r.moe_ffn_expert()):
@@ -157,7 +161,13 @@ def apply(self, i: int, r: BaseReader):
                   i)
 
         gate = transpose(r.moe_ffn_gate(i))
-        self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
+        self.model.save_split(gate, self._moe_ffn_gate.format(i))
+
+        if self.shared_gate:
+            shared_gate = transpose(r.moe_ffn_shared_gate(i))
+            # exported under its own `shared_gate.weight` key
+            self.model.save_split(shared_gate,
+                                  self._moe_ffn_shared_gate.format(i))
 
 
 class Attn(Module):
@@ -248,8 +258,11 @@ class Transformer:
 
     def __init__(self, model: BaseOutputModel):
         self.model = model
-        ffn = MoeFfn if model.model_config.expert_num else Ffn
-        modules = [Attn, LayerNorm, ffn]
+        modules = [Attn, LayerNorm]
+        if model.model_config.inter_size:
+            modules.append(Ffn)
+        if model.model_config.expert_num:
+            modules.append(MoeFfn)
         self.modules = [c(model) for c in modules]
         self.misc = Misc(model)
 

diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py
@@ -33,4 +33,6 @@ def model_info(self):
         info['expert_num'] = cfg['num_local_experts']
         info['expert_inter_size'] = cfg['intermediate_size']
         info['experts_per_token'] = cfg['num_experts_per_tok']
+        info['moe_norm_topk'] = True
+        info['inter_size'] = 0
         return info
diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -120,3 +120,64 @@ def model_info(self):
         cfg = super().model_info()
         cfg['attn_bias'] = 1
         return cfg
+
+
+class Qwen2MoeReader(LlamaReader):
+    """Reader for Qwen2-MoE checkpoints (`mlp.experts` + `mlp.shared_expert`)."""
+
+    ffn_pattern = r'shared_expert\.'
+
+    def moe_ffn_expert(self, e=None, i=None, kind=None):
+        """Get (gate, down, up) `kind` tensors of expert e in layer i."""
+        if not kind:
+            return self.filter(r'experts')
+        def load(key):
+            name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}'
+            return self.transform(self.params.get(name), kind)
+
+        return tuple(load(key) for key in ('gate', 'down', 'up'))
+
+    def moe_ffn_gate(self, i):
+        """Get the `mlp.gate.weight` entry of layer i."""
+        return self.params.get(f'model.layers.{i}.mlp.gate.weight')
+
+    def _ffn(self, i: int, kind: str):
+        """Get (gate, down, up) `kind` tensors of layer i's shared expert."""
+        if not kind:
+            return self.filter(self.ffn_pattern)
+        def load(key):
+            name = f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'
+            return self.transform(self.params[name], kind)
+
+        return tuple(load(key) for key in ('gate', 'down', 'up'))
+
+    def moe_ffn_shared_gate(self, i):
+        """Get the `mlp.shared_expert_gate.weight` entry of layer i."""
+        return self.params.get(
+            f'model.layers.{i}.mlp.shared_expert_gate.weight')
+
+
+@INPUT_MODELS.register_module(name='qwen2-moe')
+class Qwen2MoeModel(LlamaModel):
+    """Qwen2-MoE source model, registered as 'qwen2-moe'."""
+
+    Reader = Qwen2MoeReader
+
+    def tokenizer_info(self):
+        """Return (n_words, bos_id, eos_id); see
+        https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json."""  # noqa: E501
+        n_words, bos_id, eos_id = 152064, 151643, 151645
+        return n_words, bos_id, eos_id
+
+    def model_info(self):
+        """Add MoE settings from self.model_config to the base model info."""
+        cfg = self.model_config
+        info = super().model_info()
+        info['expert_num'] = cfg['num_experts']
+        info['expert_inter_size'] = cfg['moe_intermediate_size']
+        info['experts_per_token'] = cfg['num_experts_per_tok']
+        info['inter_size'] = cfg['shared_expert_intermediate_size']
+        info['moe_shared_gate'] = True
+        info['moe_norm_topk'] = cfg['norm_topk_prob']
+        info['attn_bias'] = 1
+        return info
diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py
@@ -20,6 +20,7 @@
     QWenLMHeadModel='qwen',
     # Qwen2
     Qwen2ForCausalLM='qwen2',
+    Qwen2MoeForCausalLM='qwen2-moe',
     # mistral
     MistralForCausalLM='llama',
     # llava