Skip to content

Commit

Permalink
MoE support for turbomind (#2621)
Browse files Browse the repository at this point in the history
* initial moe support

* dynamic grouped gemm

* benchmark

* moe benchmark

* moe sampling

* split-k

* refactor tuning

* simplify

* n-major weight

* add `num` for `MatrixLayout`

* packed rows

* packed cols

* dispatch for packed rows

* w4a16 moe

* refactor model loading

* fix pytorch loader

* refactor

* dispatch w4a16 moe

* fix loader

* add comment

* fix msvc build

* fix msvc build

* fix msvc build

* fix ut

* fix ut

* fix p-lora

* add all supported arches

* minor

* fix lint

* fix lint

* fix lint

* fix ut

* bf16 support

* minor

* refactor

* fix lint

* fix ut

* minor

* minor

* minor

* fix inter_size config

* load with non-standard filenames

* fix loader

* fix missing default param

* defer the loading of misc weights for safetensors

* fix conversion

* fix deepseek-vl

* verify model config

* pad inter size by group size and tp

* fix minicpm attn bias & ignore un-needed bias

* set `attn_bias` based on minicpm version
  • Loading branch information
lzhangzz authored Oct 25, 2024
1 parent 1a76efb commit 962e760
Show file tree
Hide file tree
Showing 105 changed files with 5,703 additions and 1,772 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"]
args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/*,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"]


- repo: https://github.com/myint/docformatter
Expand Down
12 changes: 11 additions & 1 deletion lmdeploy/turbomind/deploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class ModelConfig:
num_layer: int = None
inter_size: int = None
norm_eps: float = None
attn_bias: int = None
attn_bias: int = 0
start_id: int = None
end_id: int = None
size_per_head: int = 128
Expand All @@ -47,6 +47,16 @@ class ModelConfig:
session_len: int = None
tp: int = 1
model_format: str = 'hf'
expert_num: int = 0
expert_inter_size: int = 0
experts_per_token: int = 0

def verify(self):
    """Ensure the model config is fully populated.

    Scans every attribute on the instance and raises ``AssertionError``
    if any of them is still ``None`` (i.e. was never filled in during
    conversion), reporting the offending fields in the message.
    """
    unset = {name: value
             for name, value in self.__dict__.items()
             if value is None}
    assert not unset, f'incomplete model config: {unset}'


@dataclass
Expand Down
14 changes: 4 additions & 10 deletions lmdeploy/turbomind/deploy/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ...utils import _get_and_verify_max_len, is_bf16_supported
from ..supported_models import SUPPORTED_ARCHS, is_supported
from .config import TurbomindModelConfig
from .exporter import get_exporter_factory
from .module import Transformer
from .policy import get_input_policy
from .source_model.base import INPUT_MODELS
from .target_model.base import OUTPUT_MODELS
Expand Down Expand Up @@ -99,7 +99,6 @@ def get_output_model_registered_name_and_config(model_path: str,
group_size (int): the size of group used by awq model
"""
register_name = 'tm'
turbomind_model_arch = 'llama'
weight_type = 'float16'

config = TurbomindModelConfig.from_dict()
Expand All @@ -108,7 +107,6 @@ def get_output_model_registered_name_and_config(model_path: str,
session_len = 2048
else: # hf, awq, None
model_arch, model_config = get_model_arch(model_path)
turbomind_model_arch = SUPPORTED_ARCHS[model_arch]
session_len = _get_and_verify_max_len(model_config, None)
if model_format in ['awq', 'gptq']:
weight_type = 'int4'
Expand Down Expand Up @@ -148,11 +146,7 @@ def get_output_model_registered_name_and_config(model_path: str,
config.model_config.group_size = group_size
config.model_config.session_len = session_len

lora_type = 'plora' if turbomind_model_arch == 'xcomposer2' else ''

exporter_factory = get_exporter_factory(weight_type, lora_type)

return register_name, config, exporter_factory
return register_name, config


def pack_model_repository(workspace_path: str):
Expand Down Expand Up @@ -264,7 +258,7 @@ def get_tm_model(model_path,
tokenizer_path=model_path,
input_policy=input_policy)

output_model_name, tm_cfg, exporter_factory = \
output_model_name, tm_cfg = \
get_output_model_registered_name_and_config(
model_path=model_path,
model_format=engine_config.model_format,
Expand All @@ -278,7 +272,7 @@ def get_tm_model(model_path,
output_model = OUTPUT_MODELS.get(output_model_name)(
input_model=input_model,
cfg=tm_cfg,
exporter_factory=exporter_factory,
model_cls=Transformer,
out_dir=out_dir)

return output_model
Expand Down
211 changes: 0 additions & 211 deletions lmdeploy/turbomind/deploy/exporter.py

This file was deleted.

Loading

0 comments on commit 962e760

Please sign in to comment.