
Support molmo in turbomind #2716

Merged: 71 commits, merged on Nov 14, 2024
Changes from 65 commits

71 commits
81458e9
initial moe support
lzhangzz Sep 2, 2024
de31050
dynamic grouped gemm
lzhangzz Sep 29, 2024
9b71f34
benchmark
lzhangzz Sep 30, 2024
8538b2c
moe benchmark
lzhangzz Oct 8, 2024
3b731e2
moe sampling
lzhangzz Oct 8, 2024
e3c2faa
split-k
lzhangzz Oct 9, 2024
e99848b
refactor tuning
lzhangzz Oct 10, 2024
2b67a53
simplify
lzhangzz Oct 10, 2024
5176288
n-major weight
lzhangzz Oct 10, 2024
018c338
add `num` for `MatrixLayout`
lzhangzz Oct 10, 2024
fee20be
packed rows
lzhangzz Oct 11, 2024
9fcaabc
packed cols
lzhangzz Oct 11, 2024
815c581
dispatch for packed rows
lzhangzz Oct 14, 2024
a4a81d9
w4a16 moe
lzhangzz Oct 14, 2024
1db7fe1
refactor model loading
lzhangzz Oct 15, 2024
04fd8b4
fix pytorch loader
lzhangzz Oct 16, 2024
b3ceb17
refactor
lzhangzz Oct 17, 2024
c8d4ed5
dispatch w4a16 moe
lzhangzz Oct 18, 2024
3e355df
fix loader
lzhangzz Oct 18, 2024
da97d5f
add comment
lzhangzz Oct 18, 2024
7a4d6fb
Merge remote-tracking branch 'origin/main' into moe
lzhangzz Oct 18, 2024
4ca33a2
fix msvc build
lzhangzz Oct 18, 2024
75cf858
fix msvc build
lzhangzz Oct 18, 2024
ad26ada
fix msvc build
lzhangzz Oct 18, 2024
ce59d29
fix ut
lzhangzz Oct 18, 2024
36a4e4e
fix ut
lzhangzz Oct 18, 2024
bd089a9
fix p-lora
lzhangzz Oct 18, 2024
ab49732
add all support arches
lzhangzz Oct 21, 2024
7a6f6e4
minor
lzhangzz Oct 21, 2024
7c8148c
fix lint
lzhangzz Oct 21, 2024
918f1e7
fix lint
lzhangzz Oct 21, 2024
0642ebd
fix lint
lzhangzz Oct 21, 2024
c2bf33e
fix ut
lzhangzz Oct 21, 2024
783b266
bf16 support
lzhangzz Oct 21, 2024
07e15f9
minor
lzhangzz Oct 21, 2024
d12d64c
checkin molmo conversion
lvhan028 Oct 22, 2024
85edbf1
add chat template
lvhan028 Oct 22, 2024
a8bfb12
refactor
lzhangzz Oct 22, 2024
25a3320
fix lint
lzhangzz Oct 22, 2024
792c412
fix ut
lzhangzz Oct 22, 2024
4f875fb
Just for test: hardcode vocab_size
lvhan028 Oct 22, 2024
b2c858b
minor
lzhangzz Oct 22, 2024
c98ca7f
minor
lzhangzz Oct 22, 2024
504093a
minor
lzhangzz Oct 22, 2024
4769ef8
fix inter_size config
lzhangzz Oct 23, 2024
b1fa486
load with non-standard filenames
lzhangzz Oct 23, 2024
678fbed
fix loader
lzhangzz Oct 23, 2024
427c4f2
merge pr#2621
lvhan028 Oct 23, 2024
5e1c9d7
fix missing default param
lzhangzz Oct 23, 2024
1c66685
defer the loading of misc weights for safetensors
lzhangzz Oct 23, 2024
c0635f7
add embedding_size
lvhan028 Oct 23, 2024
bd013e6
update
lvhan028 Oct 23, 2024
045022b
Merge branch 'PR-2621' into support-molmo
lvhan028 Oct 23, 2024
e29c72b
update
lvhan028 Oct 23, 2024
9b68a68
tmp
lvhan028 Oct 23, 2024
0a744fa
tmp
lvhan028 Oct 24, 2024
55c32e1
merge main
lvhan028 Oct 28, 2024
62260a2
Merge branch 'main' into support-molmo
lvhan028 Oct 29, 2024
d548056
update molmo template
lvhan028 Oct 30, 2024
e3c7e77
vision embedding
lvhan028 Nov 5, 2024
2e1aea5
fix
lvhan028 Nov 5, 2024
c155963
Merge branch 'main' into support-molmo
lvhan028 Nov 5, 2024
8d8f8b9
update
lvhan028 Nov 5, 2024
0bfcae6
Merge branch 'main' into support-molmo
lvhan028 Nov 5, 2024
fea887c
fix
lvhan028 Nov 5, 2024
47c00d2
Merge branch 'main' into support-molmo
lvhan028 Nov 11, 2024
ce1f229
fix messages2prompt in templates
lvhan028 Nov 11, 2024
6fbb28b
fix order of out_messages
lvhan028 Nov 11, 2024
262f548
fix
lvhan028 Nov 12, 2024
7d200bf
add user guide
lvhan028 Nov 13, 2024
bf76d43
update is_supported
lvhan028 Nov 13, 2024
3 changes: 2 additions & 1 deletion lmdeploy/archs.py
@@ -121,7 +121,8 @@ def check_vl_llm(config: dict) -> bool:
'InternVLChatModel', 'MiniGeminiLlamaForCausalLM',
'MGMLlamaForCausalLM', 'MiniCPMV', 'LlavaForConditionalGeneration',
'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM',
'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration'
'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration',
'MolmoForCausalLM'
])
if arch == 'QWenLMHeadModel' and 'visual' in config:
return True
31 changes: 31 additions & 0 deletions lmdeploy/model.py
@@ -1729,6 +1729,37 @@ def match(cls, model_path: str) -> Optional[str]:
return 'internvl-phi3'


@MODELS.register_module(name='molmo')
class Molmo(BaseChatTemplate):

def __init__(self,
user='User: ',
eoh=' ',
assistant='Assistant:',
eoa='',
separator=' ',
stop_words=['<|endoftext|>'],
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)

@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.

Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'molmo' in path:
return 'molmo'


def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.

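A minimal usage sketch (not part of this PR's diff) of the `molmo` chat template registered above; it assumes `MODELS.get` and `BaseChatTemplate.get_prompt` behave as they do for the other templates in `lmdeploy/model.py`.

```python
# Hedged sketch: exercising the newly registered `molmo` chat template.
from lmdeploy.model import MODELS

chat_template = MODELS.get('molmo')()          # Molmo(BaseChatTemplate)
prompt = chat_template.get_prompt('What is in this image?')
# With user='User: ', eoh=' ', assistant='Assistant:' this is expected to
# resemble: "User: What is in this image? Assistant:"
print(prompt)
```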
5 changes: 5 additions & 0 deletions lmdeploy/serve/vl_async_engine.py
@@ -64,6 +64,7 @@ async def _get_prompt_input(self,
results = {}
input_ids = []
from lmdeploy.vl.templates import (MllamaTempateWrapper,
MolmoChatTemplateWrapper,
Qwen2VLChatTemplateWrapper)
ranges = None
grid_thws = None
@@ -99,6 +100,10 @@ async def _get_prompt_input(self,
results['cross_attention_states'] = features[0]
return results

if isinstance(self.vl_prompt_template,
MolmoChatTemplateWrapper):
return features[0]

features = [x.cpu().numpy() for x in features]
input_ids = []
begins = []
7 changes: 7 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -35,6 +35,13 @@ class ModelConfig:
kv_head_num: int = None
hidden_units: int = None
vocab_size: int = None
# Turbomind used to assume that token_embedding and lm_head have the same
# size along the vocab dim, i.e. `vocab_size`.
# But in molmo, embedding.shape is [vocab_size + 128, hidden_units]
# while lm_head.shape is [hidden_units, vocab_size].
# Therefore, we add a new attr "embedding_size" to represent the vocab dim
# of token_embedding
embedding_size: int = 0
num_layer: int = None
inter_size: int = None
norm_eps: float = None
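For reference, a small illustrative snippet (not from the PR) of the shape asymmetry in Molmo that motivates the new `embedding_size` field; the concrete numbers are assumptions based on the Molmo-7B-D config linked in the code comments.

```python
# Illustrative only: why ModelConfig needs both vocab_size and embedding_size.
vocab_size = 152064            # lm_head output dim (assumed Molmo-7B-D value)
additional_vocab_size = 128    # extra rows appended to the token embedding
hidden_units = 3584            # assumed Molmo-7B-D hidden size

token_embedding_shape = (vocab_size + additional_vocab_size, hidden_units)  # wte
lm_head_shape = (hidden_units, vocab_size)                                  # ff_out

print(token_embedding_shape)  # (152192, 3584) -> embedding_size = 152192
print(lm_head_shape)          # (3584, 152064) -> vocab_size     = 152064
```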
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -8,5 +8,6 @@
from .meta_llama import MetaLlamaModel # noqa: F401
from .minicpmv import MiniCPMVModel # noqa: F401
from .mixtral import MixtralModel # noqa: F401
from .molmo import MolmoModel # noqa: F401
from .qwen import QwenModel # noqa: F401
from .xcomposer2 import Xcomposer2Model # noqa: F401
122 changes: 122 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/molmo.py
@@ -0,0 +1,122 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp

import torch

from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader


class MolmoReader(LlamaReader):
attn_layer_prefix = 'model.transformer.blocks'
attn_layer_patten = r'model.transformer.blocks.([0-9]+).'
norm_weight_key = 'model.transformer.ln_f.weight'
output_weight_key = 'model.transformer.ff_out.weight'

# In molmo, the names of attention parameters are "att_proj.bias",
# "att_proj.weight", "attn_norm.weight", "attn_out.weight", and the names
# of ffn parameters are "ff_norm", "ff_out", "ff_proj", so we set the
# patterns to r'att' and r'ff_', respectively.
attn_pattern = r'att'
ffn_pattern = r'ff_'

def tok_embeddings(self):
embed1 = self.params.get('model.transformer.wte.embedding', None)
embed2 = self.params.get('model.transformer.wte.new_embedding', None)
if embed1 is not None and embed2 is not None:
return torch.cat((embed1, embed2), dim=0)
else:
assert embed1 is None and embed2 is None
return None

def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'{self.attn_layer_prefix}.{i}.attn_norm.weight']

def _attn(self, i: int, kind: str):
"""Get q, k, v, o kind(weight, bias, qweight) for layer i.

Args:
i (int): layer id
kind (str): can be one of ["weight", "bias", "qweight"]
"""
q, k, v = (None, ) * 3
hidden_size = self.model_cfg['hidden_size']
head_num = self.model_cfg['num_attention_heads']
kv_head_num = self.model_cfg['num_key_value_heads']
head_dim = hidden_size // head_num
assert head_dim == 128
fused_dims = (hidden_size, kv_head_num * head_dim,
kv_head_num * head_dim)
qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.att_proj.{kind}')
qkv = self.transform(qkv, kind)
if qkv is not None:
q, k, v = qkv.split(fused_dims, dim=0)
o = self.params.get(f'{self.attn_layer_prefix}.{i}.attn_out.{kind}')
o = self.transform(o, kind)
if o is None: # handle the case when qkv has bias but o doesn't
o = torch.zeros_like(q)
return (q, k, v, o)

def _ffn(self, i: int, kind: str):
"""Get ffn kind(weight, qweight) for layer i."""
up_and_gate = self.params[
f'{self.attn_layer_prefix}.{i}.ff_proj.{kind}']
up_and_gate = self.transform(up_and_gate, kind)
gate, up = up_and_gate.chunk(2, dim=0)
down = self.params[f'{self.attn_layer_prefix}.{i}.ff_out.{kind}']
down = self.transform(down, kind)
return (up, down, gate)

def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'{self.attn_layer_prefix}.{i}.ff_norm.weight']


@INPUT_MODELS.register_module(name='molmo')
class MolmoModel(LlamaModel):

Reader = MolmoReader

def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
config_path = osp.join(self.model_path, 'config.json')
with open(config_path) as f:
self.config = json.load(f)

def tokenizer_info(self):

n_words = 152064
bos_id = 151643
eos_id = 151643
return n_words, bos_id, eos_id

def model_info(self):
config = self.config
num_layer = config['num_hidden_layers']
norm_eps = config['layer_norm_eps']
attn_head_num = config['num_attention_heads']
kv_head_num = config['num_key_value_heads']
hidden_units = config['hidden_size']
rope_theta = config['rope_theta']
max_position_embeddings = config['max_position_embeddings']
vocab_size = config['vocab_size']
# https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L2041
additional_vocab_size = 128
inter_size = config['intermediate_size'] // 2
attn_bias = config['qkv_bias']
return dict(
num_layer=num_layer,
norm_eps=norm_eps,
head_num=attn_head_num,
kv_head_num=kv_head_num,
hidden_units=hidden_units,
attn_bias=int(attn_bias),
inter_size=inter_size,
vocab_size=vocab_size,
# https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L564
embedding_size=vocab_size + additional_vocab_size,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
)
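A standalone sketch (assumed Molmo-7B-D config values, not part of the diff) of the fused-QKV split performed by `MolmoReader._attn` above.

```python
# Hedged sketch of how the fused att_proj tensor splits into q, k, v.
import torch

hidden_size, head_num, kv_head_num = 3584, 28, 4   # assumed Molmo-7B-D values
head_dim = hidden_size // head_num                  # 128, matching the assert in the reader
fused_dims = (hidden_size, kv_head_num * head_dim, kv_head_num * head_dim)

# stands in for params['model.transformer.blocks.<i>.att_proj.weight']
att_proj = torch.randn(sum(fused_dims), hidden_size)
q, k, v = att_proj.split(fused_dims, dim=0)
print(q.shape, k.shape, v.shape)  # (3584, 3584), (512, 3584), (512, 3584)
```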
3 changes: 3 additions & 0 deletions lmdeploy/turbomind/deploy/target_model/base.py
@@ -91,6 +91,9 @@ def update_model_config(self):
final_cfg = config_to_dict(self.model_config)
final_cfg.update(dict(start_id=bos_id, end_id=eos_id))
final_cfg.update(self.input_model_info)
if 'embedding_size' not in self.input_model_info.keys():
final_cfg.update(
embedding_size=self.input_model_info['vocab_size'])

self.model_config = config_from_dict(ModelConfig, final_cfg)

4 changes: 3 additions & 1 deletion lmdeploy/turbomind/supported_models.py
@@ -40,7 +40,9 @@
ChatGLMModel='glm4',
ChatGLMForConditionalGeneration='glm4',
# mixtral
MixtralForCausalLM='mixtral')
MixtralForCausalLM='mixtral',
MolmoForCausalLM='molmo',
)


def is_supported(model_path: str):
10 changes: 9 additions & 1 deletion lmdeploy/vl/model/builder.py
@@ -18,6 +18,7 @@
from .mini_gemeni import MiniGeminiVisionModel # noqa F401
from .minicpmv import MiniCPMVModel # noqa F401
from .mllama import MllamaVLModel # noqa F401
from .molmo import MolmoVisionModel # noqa F401
from .phi3_vision import Phi3VisionModel # noqa F401
from .qwen import QwenVisionModel # noqa F401
from .qwen2 import Qwen2VLModel # noqa F401
@@ -31,7 +32,14 @@ def load_vl_model(model_path: str,
with_llm: bool = False,
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None):
"""load visual model."""
"""load visual model.

Args:
model_path(str): the path or repo_id from model hub of the model
with_llm(bool): whether to keep the LLM part in the loaded model.
When it is False, the LLM part is removed and only the vision model is kept
backend_config: the config of the inference engine
"""
if not os.path.exists(model_path):
revision = getattr(backend_config, 'revision', None)
download_dir = getattr(backend_config, 'download_dir', None)
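A hedged usage sketch of the `load_vl_model` signature documented above; the model id and engine settings are illustrative rather than taken from the PR.

```python
# Usage sketch (illustrative values) for the load_vl_model API shown above.
from lmdeploy import TurbomindEngineConfig
from lmdeploy.vl.model.builder import load_vl_model

backend_config = TurbomindEngineConfig(session_len=8192)
# with_llm=False: the LLM part is dropped and only the vision model is kept
vision_model = load_vl_model('allenai/Molmo-7B-D-0924',
                             with_llm=False,
                             backend_config=backend_config)
```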