
[Feature]: support LlavaForConditionalGeneration with turbomind inference #2710

Merged · 9 commits · Nov 8, 2024
15 changes: 15 additions & 0 deletions examples/python/README.md
@@ -0,0 +1,15 @@
## Support LLaVA-Interleave-Qwen-7B-hf

### generate gemm config (Optional)

`python3 lmdeploy/turbomind/generate_gemm_config.py --tensor-para-size 1 --max-batch-size 4 --model-path /models/llava-interleave-qwen-7b-hf`

### generate awq format model (Optional)

`lmdeploy lite auto_awq --work-dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf`

### run offline inference

`python3 offline_vl.py models/llava-interleave-qwen-7b-hf`

`python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq`
34 changes: 34 additions & 0 deletions examples/python/offline_vl.py
@@ -0,0 +1,34 @@
import argparse

from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='offline VLM inference demo')
    parser.add_argument('model_path',
                        type=str,
                        nargs='?',
                        help='local path of the model or '
                        'repo_id of the model on huggingface.co',
                        default='llava-hf/llava-interleave-qwen-7b-hf')
    parser.add_argument('--model-format',
                        type=str,
                        help='model format',
                        default='hf',
                        choices=['hf', 'awq'])
    parser.add_argument('--max-new-tokens',
                        type=int,
                        help='maximum number of new tokens to generate',
                        default=128)
    args = parser.parse_args()
    pipe = pipeline(
        args.model_path,
        backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5,
                                             model_format=args.model_format),
        gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens))

    image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/' +
                       'Qwen-VL/assets/demo.jpeg')
    for prompt in ['Describe the image.', 'How many people are in the image?']:
        print(f'prompt: {prompt}')
        response = pipe((prompt, image))
        print(response)
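
The pipeline used above also accepts a batch of (prompt, image) pairs in one call, which avoids looping over prompts. A minimal sketch of that usage (the batching call and the `.text` field are assumed from lmdeploy's VLM pipeline API; the model path is illustrative, not part of this PR):

```python
from lmdeploy import TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image

# Sketch: batched VLM inference; the model path is illustrative.
pipe = pipeline('models/llava-interleave-qwen-7b-hf',
                backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5))
image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/' +
                   'Qwen-VL/assets/demo.jpeg')
prompts = [('Describe the image.', image),
           ('How many people are in the image?', image)]
responses = pipe(prompts)  # one Response per (prompt, image) pair
for res in responses:
    print(res.text)
```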
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -5,6 +5,7 @@
from .internlm2 import InternLM2Model # noqa: F401
from .internvl import InternVLModel # noqa: F401
from .llama import LlamaModel # noqa: F401
from .llava_qwen2 import LlavaQwen2Model # noqa: F401
from .meta_llama import MetaLlamaModel # noqa: F401
from .minicpmv import MiniCPMVModel # noqa: F401
from .mixtral import MixtralModel # noqa: F401
77 changes: 77 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/llava_qwen2.py
@@ -0,0 +1,77 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp

from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader


class LlavaQwen2Reader(LlamaReader):
    """Reader for LLaVA models with a Qwen2 language model."""

    attn_layer_prefix = 'language_model.model.layers'
    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
    norm_weight_key = 'language_model.model.norm.weight'
    output_weight_key = 'language_model.lm_head.weight'

    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
                 model_cfg: dict, policy):
        model_cfg = model_cfg.get('text_config')
        super().__init__(new_params, unused_params, last_bin, model_cfg,
                         policy)


@INPUT_MODELS.register_module(name='llava_qwen2')
class LlavaQwen2Model(LlamaModel):
    """LLaVA (Qwen2 LLM) model in hf format."""

    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
        super().__init__(model_path, tokenizer_path, **kwargs)
        self.Reader = LlavaQwen2Reader

    def model_info(self):
        """Read model info from the nested text_config in config.json."""
        params_path = osp.join(self.model_path, 'config.json')
        with open(params_path) as f:
            model_arg = json.load(f)['text_config']
            num_layer = model_arg.get('num_hidden_layers', 32)
            norm_eps = model_arg.get('rms_norm_eps', 1e-6)
            attn_head_num = model_arg.get('num_attention_heads', 32)
            if 'num_key_value_heads' in model_arg:
                kv_head_num = model_arg.get('num_key_value_heads', 32)
            else:
                kv_head_num = model_arg.get('num_attention_heads', 32)
            rope_theta = float(model_arg.get('rope_theta', 10000.0))
            max_position_embeddings = int(
                model_arg.get('max_position_embeddings', 0))
            rope_scaling = model_arg.get('rope_scaling', None)
            scaling_factor = 0.0
            use_dynamic_ntk = 0

            # defaults below match llava-hf/llava-interleave-qwen-7b-hf
            hidden_units = model_arg.get('hidden_size', 4096)
            vocab_size = model_arg.get('vocab_size', 152000)
            intermediate_size = model_arg.get('intermediate_size', 11008)
            attn_bias = int(model_arg.get('attn_bias', 1))
            use_logn_attn = int(model_arg.get('use_logn_attn', 0))

            if isinstance(rope_scaling, dict):
                scaling_type = model_arg['rope_scaling'].get('type', '')
                scaling_factor = model_arg['rope_scaling'].get('factor', 0.0)
                if scaling_type == 'dynamic':
                    use_dynamic_ntk = 1

        return dict(num_layer=num_layer,
                    norm_eps=norm_eps,
                    head_num=attn_head_num,
                    hidden_units=hidden_units,
                    kv_head_num=kv_head_num,
                    rope_theta=rope_theta,
                    max_position_embeddings=max_position_embeddings,
                    use_dynamic_ntk=use_dynamic_ntk,
                    rope_scaling_factor=scaling_factor,
                    inter_size=intermediate_size,
                    use_logn_attn=use_logn_attn,
                    attn_bias=attn_bias,
                    vocab_size=vocab_size)
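
For reference, a hedged sketch of the `text_config` that `model_info()` above consumes; the field values are illustrative (roughly those of a Qwen2-7B-based LLaVA checkpoint) rather than copied from the PR:

```python
# Illustrative, trimmed config.json for a LlavaForConditionalGeneration
# checkpoint; real files carry many more fields.
sample_config = {
    'architectures': ['LlavaForConditionalGeneration'],
    'text_config': {
        'num_hidden_layers': 28,
        'num_attention_heads': 28,
        'num_key_value_heads': 4,
        'hidden_size': 3584,
        'intermediate_size': 18944,
        'rms_norm_eps': 1e-06,
        'rope_theta': 1000000.0,
        'vocab_size': 152064,
    },
}

text_cfg = sample_config['text_config']
# the head dimension turbomind checks for (see supported_models.py below)
print(text_cfg['hidden_size'] // text_cfg['num_attention_heads'])  # 128
```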
21 changes: 17 additions & 4 deletions lmdeploy/turbomind/generate_gemm_config.py
@@ -54,10 +54,23 @@ def main(head_num: int = 32,
         from transformers import AutoConfig
         config = AutoConfig.from_pretrained(model_path,
                                             trust_remote_code=True)
-        head_num = config.num_attention_heads
-        size_per_head = config.hidden_size // head_num
-        inter_size = config.intermediate_size
-        vocab_size = config.vocab_size
+        try:
+            head_num = config.num_attention_heads
+            size_per_head = config.hidden_size // head_num
+            inter_size = config.intermediate_size
+            vocab_size = config.vocab_size
+        except AttributeError as e:
+            if hasattr(config, 'text_config'):
+                config = config.text_config
+            elif hasattr(config, 'llm_config'):
+                config = config.llm_config
+            else:
+                raise AttributeError(f'required attributes not found in {config}; '
+                                     'please check your model config file.') from e
+            head_num = config.num_attention_heads
+            size_per_head = config.hidden_size // head_num
+            inter_size = config.intermediate_size
+            vocab_size = config.vocab_size
     for bsz in range(1, max_batch_size + 1):
         subprocess.call(
             f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}'
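
A hedged sketch of the fallback this hunk introduces: multimodal configs keep the LLM hyperparameters in a nested sub-config (`text_config` for LLaVA-style checkpoints, `llm_config` for InternVL-style ones), so the lookup retries on that sub-config. The helper name below is illustrative and not part of the PR:

```python
from transformers import AutoConfig


def resolve_llm_config(model_path: str):
    """Return the (sub-)config that carries the LLM hyperparameters."""
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    if hasattr(config, 'num_attention_heads'):
        return config
    if hasattr(config, 'text_config'):  # e.g. LlavaForConditionalGeneration
        return config.text_config
    if hasattr(config, 'llm_config'):  # e.g. InternVLChatModel
        return config.llm_config
    raise AttributeError(f'required attributes not found in {config}; '
                         'please check your model config file.')
```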
4 changes: 4 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -25,6 +25,8 @@
    # llava
    LlavaLlamaForCausalLM='llama',
    LlavaMistralForCausalLM='llama',
    # llava-interleave
    LlavaForConditionalGeneration='llava_qwen2',
    # xcomposer2
    InternLMXComposer2ForCausalLM='xcomposer2',
    # internvl
@@ -99,5 +101,7 @@ def _is_head_dim_128(cfg):
    elif arch == 'InternVLChatModel':
        # internvl2-4b, internlm2-1b are not working yet
        support_by_turbomind = _is_head_dim_128(cfg.llm_config)
    elif arch == 'LlavaForConditionalGeneration':
        support_by_turbomind = _is_head_dim_128(cfg.text_config)

    return support_by_turbomind
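
For context, a sketch of the check `_is_head_dim_128` is assumed to perform: turbomind's attention kernels require a head dimension of 128, computed from the language-model sub-config. This mirrors the existing helper rather than adding behavior:

```python
def is_head_dim_128_sketch(llm_cfg) -> bool:
    # head_dim = hidden_size / num_attention_heads; turbomind assumes 128
    return llm_cfg.hidden_size // llm_cfg.num_attention_heads == 128
```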