
Support molmo in turbomind #2716

Merged: 71 commits, merged on Nov 14, 2024
Changes from 65 commits

71 commits
81458e9
initial moe support
lzhangzz Sep 2, 2024
de31050
dynamic grouped gemm
lzhangzz Sep 29, 2024
9b71f34
benchmark
lzhangzz Sep 30, 2024
8538b2c
moe benchmark
lzhangzz Oct 8, 2024
3b731e2
moe sampling
lzhangzz Oct 8, 2024
e3c2faa
split-k
lzhangzz Oct 9, 2024
e99848b
refactor tuning
lzhangzz Oct 10, 2024
2b67a53
simplify
lzhangzz Oct 10, 2024
5176288
n-major weight
lzhangzz Oct 10, 2024
018c338
add `num` for `MatrixLayout`
lzhangzz Oct 10, 2024
fee20be
packed rows
lzhangzz Oct 11, 2024
9fcaabc
packed cols
lzhangzz Oct 11, 2024
815c581
dispatch for packed rows
lzhangzz Oct 14, 2024
a4a81d9
w4a16 moe
lzhangzz Oct 14, 2024
1db7fe1
refactor model loading
lzhangzz Oct 15, 2024
04fd8b4
fix pytorch loader
lzhangzz Oct 16, 2024
b3ceb17
refactor
lzhangzz Oct 17, 2024
c8d4ed5
dispatch w4a16 moe
lzhangzz Oct 18, 2024
3e355df
fix loader
lzhangzz Oct 18, 2024
da97d5f
add comment
lzhangzz Oct 18, 2024
7a4d6fb
Merge remote-tracking branch 'origin/main' into moe
lzhangzz Oct 18, 2024
4ca33a2
fix msvc build
lzhangzz Oct 18, 2024
75cf858
fix msvc build
lzhangzz Oct 18, 2024
ad26ada
fix msvc build
lzhangzz Oct 18, 2024
ce59d29
fix ut
lzhangzz Oct 18, 2024
36a4e4e
fix ut
lzhangzz Oct 18, 2024
bd089a9
fix p-lora
lzhangzz Oct 18, 2024
ab49732
add all support arches
lzhangzz Oct 21, 2024
7a6f6e4
minor
lzhangzz Oct 21, 2024
7c8148c
fix lint
lzhangzz Oct 21, 2024
918f1e7
fix lint
lzhangzz Oct 21, 2024
0642ebd
fix lint
lzhangzz Oct 21, 2024
c2bf33e
fix ut
lzhangzz Oct 21, 2024
783b266
bf16 support
lzhangzz Oct 21, 2024
07e15f9
minor
lzhangzz Oct 21, 2024
d12d64c
checkin molmo conversion
lvhan028 Oct 22, 2024
85edbf1
add chat template
lvhan028 Oct 22, 2024
a8bfb12
refactor
lzhangzz Oct 22, 2024
25a3320
fix lint
lzhangzz Oct 22, 2024
792c412
fix ut
lzhangzz Oct 22, 2024
4f875fb
Just for test: hardcode vocab_size
lvhan028 Oct 22, 2024
b2c858b
minor
lzhangzz Oct 22, 2024
c98ca7f
minor
lzhangzz Oct 22, 2024
504093a
minor
lzhangzz Oct 22, 2024
4769ef8
fix inter_size config
lzhangzz Oct 23, 2024
b1fa486
load with non-standard filenames
lzhangzz Oct 23, 2024
678fbed
fix loader
lzhangzz Oct 23, 2024
427c4f2
merge pr#2621
lvhan028 Oct 23, 2024
5e1c9d7
fix missing default param
lzhangzz Oct 23, 2024
1c66685
defer the loading of misc weights for safetensors
lzhangzz Oct 23, 2024
c0635f7
add embedding_size
lvhan028 Oct 23, 2024
bd013e6
update
lvhan028 Oct 23, 2024
045022b
Merge branch 'PR-2621' into support-molmo
lvhan028 Oct 23, 2024
e29c72b
update
lvhan028 Oct 23, 2024
9b68a68
tmp
lvhan028 Oct 23, 2024
0a744fa
tmp
lvhan028 Oct 24, 2024
55c32e1
merge main
lvhan028 Oct 28, 2024
62260a2
Merge branch 'main' into support-molmo
lvhan028 Oct 29, 2024
d548056
update molmo template
lvhan028 Oct 30, 2024
e3c7e77
vision embedding
lvhan028 Nov 5, 2024
2e1aea5
fix
lvhan028 Nov 5, 2024
c155963
Merge branch 'main' into support-molmo
lvhan028 Nov 5, 2024
8d8f8b9
update
lvhan028 Nov 5, 2024
0bfcae6
Merge branch 'main' into support-molmo
lvhan028 Nov 5, 2024
fea887c
fix
lvhan028 Nov 5, 2024
47c00d2
Merge branch 'main' into support-molmo
lvhan028 Nov 11, 2024
ce1f229
fix messages2prompt in templates
lvhan028 Nov 11, 2024
6fbb28b
fix order of out_messages
lvhan028 Nov 11, 2024
262f548
fix
lvhan028 Nov 12, 2024
7d200bf
add user guide
lvhan028 Nov 13, 2024
bf76d43
update is_supported
lvhan028 Nov 13, 2024
3 changes: 2 additions & 1 deletion lmdeploy/archs.py
@@ -121,7 +121,8 @@ def check_vl_llm(config: dict) -> bool:
'InternVLChatModel', 'MiniGeminiLlamaForCausalLM',
'MGMLlamaForCausalLM', 'MiniCPMV', 'LlavaForConditionalGeneration',
'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM',
'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration'
'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration',
'MolmoForCausalLM'
])
if arch == 'QWenLMHeadModel' and 'visual' in config:
return True
31 changes: 31 additions & 0 deletions lmdeploy/model.py
@@ -1729,6 +1729,37 @@ def match(cls, model_path: str) -> Optional[str]:
return 'internvl-phi3'


@MODELS.register_module(name='molmo')
class Molmo(BaseChatTemplate):

def __init__(self,
user='User: ',
eoh=' ',
assistant='Assistant:',
eoa='',
separator=' ',
stop_words=['<|endoftext|>'],
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)

@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.

Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'molmo' in path:
return 'molmo'


def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.

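A minimal usage sketch (not part of this PR's diff) of the `molmo` chat template registered above; it assumes `MODELS.get` and `BaseChatTemplate.get_prompt` behave as they do for the other templates in `lmdeploy/model.py`.

```python
# Hedged sketch: exercising the newly registered `molmo` chat template.
from lmdeploy.model import MODELS

chat_template = MODELS.get('molmo')()          # Molmo(BaseChatTemplate)
prompt = chat_template.get_prompt('What is in this image?')
# With user='User: ', eoh=' ', assistant='Assistant:' this is expected to
# resemble: "User: What is in this image? Assistant:"
print(prompt)
```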
5 changes: 5 additions & 0 deletions lmdeploy/serve/vl_async_engine.py
@@ -64,6 +64,7 @@ async def _get_prompt_input(self,
results = {}
input_ids = []
from lmdeploy.vl.templates import (MllamaTempateWrapper,
MolmoChatTemplateWrapper,
Qwen2VLChatTemplateWrapper)
ranges = None
grid_thws = None
@@ -99,6 +100,10 @@ async def _get_prompt_input(self,
results['cross_attention_states'] = features[0]
return results

if isinstance(self.vl_prompt_template,
MolmoChatTemplateWrapper):
return features[0]

features = [x.cpu().numpy() for x in features]
input_ids = []
begins = []
7 changes: 7 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -35,6 +35,13 @@ class ModelConfig:
kv_head_num: int = None
hidden_units: int = None
vocab_size: int = None
# Turbomind used to assume that token_embedding and lm_head have the same
# size along the vocab dim, i.e. `vocab_size`.
# But in molmo, embedding.shape is [vocab_size + 128, hidden_units]
# while lm_head.shape is [hidden_units, vocab_size].
# Therefore, we add a new attr "embedding_size" to represent the vocab dim
# of token_embedding
embedding_size: int = 0
num_layer: int = None
inter_size: int = None
norm_eps: float = None
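For reference, a small illustrative snippet (not from the PR) of the shape asymmetry in Molmo that motivates the new `embedding_size` field; the concrete numbers are assumptions based on the Molmo-7B-D config linked in the code comments.

```python
# Illustrative only: why ModelConfig needs both vocab_size and embedding_size.
vocab_size = 152064            # lm_head output dim (assumed Molmo-7B-D value)
additional_vocab_size = 128    # extra rows appended to the token embedding
hidden_units = 3584            # assumed Molmo-7B-D hidden size

token_embedding_shape = (vocab_size + additional_vocab_size, hidden_units)  # wte
lm_head_shape = (hidden_units, vocab_size)                                  # ff_out

print(token_embedding_shape)  # (152192, 3584) -> embedding_size = 152192
print(lm_head_shape)          # (3584, 152064) -> vocab_size     = 152064
```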
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -8,5 +8,6 @@
from .meta_llama import MetaLlamaModel # noqa: F401
from .minicpmv import MiniCPMVModel # noqa: F401
from .mixtral import MixtralModel # noqa: F401
from .molmo import MolmoModel # noqa: F401
from .qwen import QwenModel # noqa: F401
from .xcomposer2 import Xcomposer2Model # noqa: F401
122 changes: 122 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/molmo.py
@@ -0,0 +1,122 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp

import torch

from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader


class MolmoReader(LlamaReader):
attn_layer_prefix = 'model.transformer.blocks'
attn_layer_patten = r'model.transformer.blocks.([0-9]+).'
norm_weight_key = 'model.transformer.ln_f.weight'
output_weight_key = 'model.transformer.ff_out.weight'

# In molmo, the names of attention parameters are "att_proj.bias",
# "att_proj.weight", "attn_norm.weight", "attn_out.weight", and the names
# of ffn parameters are "ff_norm", "ff_out", "ff_proj", so we set the
# patterns to r'att' and r'ff_', respectively.
attn_pattern = r'att'
ffn_pattern = r'ff_'

def tok_embeddings(self):
embed1 = self.params.get('model.transformer.wte.embedding', None)
embed2 = self.params.get('model.transformer.wte.new_embedding', None)
if embed1 is not None and embed2 is not None:
return torch.cat((embed1, embed2), dim=0)
else:
assert embed1 is None and embed2 is None
return None

def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'{self.attn_layer_prefix}.{i}.attn_norm.weight']

def _attn(self, i: int, kind: str):
"""Get q, k, v, o kind(weight, bias, qweight) for layer i.

Args:
i (int): layer id
kind (str): can be one of ["weight", "bias", "qweight"]
"""
q, k, v = (None, ) * 3
hidden_size = self.model_cfg['hidden_size']
head_num = self.model_cfg['num_attention_heads']
kv_head_num = self.model_cfg['num_key_value_heads']
head_dim = hidden_size // head_num
assert head_dim == 128
fused_dims = (hidden_size, kv_head_num * head_dim,
kv_head_num * head_dim)
qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.att_proj.{kind}')
qkv = self.transform(qkv, kind)
if qkv is not None:
q, k, v = qkv.split(fused_dims, dim=0)
o = self.params.get(f'{self.attn_layer_prefix}.{i}.attn_out.{kind}')
o = self.transform(o, kind)
if o is None: # handle the case when qkv has bias but o doesn't
o = torch.zeros_like(q)
return (q, k, v, o)

def _ffn(self, i: int, kind: str):
"""Get ffn kind(weight, qweight) for layer i."""
up_and_gate = self.params[
f'{self.attn_layer_prefix}.{i}.ff_proj.{kind}']
up_and_gate = self.transform(up_and_gate, kind)
gate, up = up_and_gate.chunk(2, dim=0)
down = self.params[f'{self.attn_layer_prefix}.{i}.ff_out.{kind}']
down = self.transform(down, kind)
return (up, down, gate)

def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'{self.attn_layer_prefix}.{i}.ff_norm.weight']


@INPUT_MODELS.register_module(name='molmo')
class MolmoModel(LlamaModel):

Reader = MolmoReader

def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
config_path = osp.join(self.model_path, 'config.json')
with open(config_path) as f:
self.config = json.load(f)

def tokenizer_info(self):

n_words = 152064
bos_id = 151643
eos_id = 151643
return n_words, bos_id, eos_id

def model_info(self):
config = self.config
num_layer = config['num_hidden_layers']
norm_eps = config['layer_norm_eps']
attn_head_num = config['num_attention_heads']
kv_head_num = config['num_key_value_heads']
hidden_units = config['hidden_size']
rope_theta = config['rope_theta']
max_position_embeddings = config['max_position_embeddings']
vocab_size = config['vocab_size']
# https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L2041
additional_vocab_size = 128
inter_size = config['intermediate_size'] // 2
attn_bias = config['qkv_bias']
return dict(
num_layer=num_layer,
norm_eps=norm_eps,
head_num=attn_head_num,
kv_head_num=kv_head_num,
hidden_units=hidden_units,
attn_bias=int(attn_bias),
inter_size=inter_size,
vocab_size=vocab_size,
# https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L564
embedding_size=vocab_size + additional_vocab_size,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
)
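A standalone sketch (assumed Molmo-7B-D config values, not part of the diff) of the fused-QKV split performed by `MolmoReader._attn` above.

```python
# Hedged sketch of how the fused att_proj tensor splits into q, k, v.
import torch

hidden_size, head_num, kv_head_num = 3584, 28, 4   # assumed Molmo-7B-D values
head_dim = hidden_size // head_num                  # 128, matching the assert in the reader
fused_dims = (hidden_size, kv_head_num * head_dim, kv_head_num * head_dim)

# stands in for params['model.transformer.blocks.<i>.att_proj.weight']
att_proj = torch.randn(sum(fused_dims), hidden_size)
q, k, v = att_proj.split(fused_dims, dim=0)
print(q.shape, k.shape, v.shape)  # (3584, 3584), (512, 3584), (512, 3584)
```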
3 changes: 3 additions & 0 deletions lmdeploy/turbomind/deploy/target_model/base.py
@@ -91,6 +91,9 @@ def update_model_config(self):
final_cfg = config_to_dict(self.model_config)
final_cfg.update(dict(start_id=bos_id, end_id=eos_id))
final_cfg.update(self.input_model_info)
if 'embedding_size' not in self.input_model_info.keys():
final_cfg.update(
embedding_size=self.input_model_info['vocab_size'])

self.model_config = config_from_dict(ModelConfig, final_cfg)

4 changes: 3 additions & 1 deletion lmdeploy/turbomind/supported_models.py
@@ -40,7 +40,9 @@
ChatGLMModel='glm4',
ChatGLMForConditionalGeneration='glm4',
# mixtral
MixtralForCausalLM='mixtral')
MixtralForCausalLM='mixtral',
MolmoForCausalLM='molmo',
)


def is_supported(model_path: str):
10 changes: 9 additions & 1 deletion lmdeploy/vl/model/builder.py
@@ -18,6 +18,7 @@
from .mini_gemeni import MiniGeminiVisionModel # noqa F401
from .minicpmv import MiniCPMVModel # noqa F401
from .mllama import MllamaVLModel # noqa F401
from .molmo import MolmoVisionModel # noqa F401
from .phi3_vision import Phi3VisionModel # noqa F401
from .qwen import QwenVisionModel # noqa F401
from .qwen2 import Qwen2VLModel # noqa F401
@@ -31,7 +32,14 @@ def load_vl_model(model_path: str,
with_llm: bool = False,
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None):
"""load visual model."""
"""load visual model.

Args:
model_path(str): the path or repo_id from model hub of the model
with_llm(bool): whether to keep the LLM part in the loaded model.
When it is False, the LLM part is removed and only the vision model is kept
backend_config: the config of the inference engine
"""
if not os.path.exists(model_path):
revision = getattr(backend_config, 'revision', None)
download_dir = getattr(backend_config, 'download_dir', None)
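A hedged usage sketch of the `load_vl_model` signature documented above; the model id and engine settings are illustrative rather than taken from the PR.

```python
# Usage sketch (illustrative values) for the load_vl_model API shown above.
from lmdeploy import TurbomindEngineConfig
from lmdeploy.vl.model.builder import load_vl_model

backend_config = TurbomindEngineConfig(session_len=8192)
# with_llm=False: the LLM part is dropped and only the vision model is kept
vision_model = load_vl_model('allenai/Molmo-7B-D-0924',
                             with_llm=False,
                             backend_config=backend_config)
```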