feat: support llava_qwen2 for fp16 and awq
deepindeed2022 committed Nov 4, 2024
1 parent a417a87 commit 5a212d0
Showing 5 changed files with 134 additions and 0 deletions.
19 changes: 19 additions & 0 deletions examples/python/README.md
@@ -0,0 +1,19 @@
## Support LLaVA-Interleave-Qwen-7B-hf

### AWQ

    lmdeploy lite auto_awq --work_dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf
    lmdeploy serve api_server models/llava-interleave-qwen-7b-hf/awq --model-format awq
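
Once the server is running, it exposes an OpenAI-compatible API. Below is a minimal client sketch, assuming the api_server default port (23333) and the `openai` Python package; the image URL is the demo image used by `offline_vl.py`:

```python
from openai import OpenAI

# Sketch: query the api_server started above. Assumes the default port 23333;
# adjust base_url if the server was started with a different --server-port.
client = OpenAI(api_key='none', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Describe the image.'},
            {'type': 'image_url',
             'image_url': {'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}},
        ],
    }],
    max_tokens=128)
print(response.choices[0].message.content)
```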


### Offline

    python3 offline_vl.py models/llava-interleave-qwen-7b-hf

    python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq

```text
Response(text="The image is a photograph capturing a moment between a person and a dog on a sandy beach. The person is seated on the sand, wearing a plaid shirt and pants, with their legs crossed. They are holding a small object in their hand, which appears to be a toy or a small treat, and are extending their hand towards the dog. The dog, which is standing on the sand, has its front paws raised towards the person's hand, suggesting an interaction or a gesture of play or gratitude. The dog is wearing a colorful harness with a pattern that includes blue, red, and yellow colors. The background features a calm sea with gentle waves lapping at the shore, and the sky is clear with a soft gradient from light to darker blue, indicating either sunrise or sunset. The lighting in the photograph is warm, contributing to the serene atmosphere of the scene. There are no visible texts or brands in the image.", generate_token_len=187, input_token_len=753, session_id=0, finish_reason='stop', token_ids=[785, 2168, 374, 264, 10300, 39780, 264, 4445, 1948, 264, 1697, 323, 264, 5562, 389, 264, 67439, 11321, 13, 576, 1697, 374, 46313, 389, 279, 9278, 11, 12233, 264, 625, 3779, 15478, 323, 24549, 11, 448, 862, 14201, 27031, 13, 2379, 525, 9963, 264, 2613, 1633, 304, 862, 1424, 11, 892, 7952, 311, 387, 264, 21357, 476, 264, 2613, 4228, 11, 323, 525, 32359, 862, 1424, 6974, 279, 5562, 13, 576, 5562, 11, 892, 374, 11259, 389, 279, 9278, 11, 702, 1181, 4065, 281, 8635, 9226, 6974, 279, 1697, 594, 1424, 11, 22561, 458, 16230, 476, 264, 30157, 315, 1486, 476, 45035, 13, 576, 5562, 374, 12233, 264, 33866, 32408, 448, 264, 5383, 429, 5646, 6303, 11, 2518, 11, 323, 13753, 7987, 13, 576, 4004, 4419, 264, 19300, 9396, 448, 21700, 16876, 326, 3629, 518, 279, 30184, 11, 323, 279, 12884, 374, 2797, 448, 264, 8413, 20169, 504, 3100, 311, 39030, 6303, 11, 18860, 2987, 63819, 476, 42984, 13, 576, 17716, 304, 279, 10300, 374, 8205, 11, 28720, 311, 279, 94763, 16566, 315, 279, 6109, 13, 2619, 525, 902, 9434, 21984, 476, 15721, 304, 279, 2168, 13], logprobs=None, index=0)
prompt:How many people in the image?
Response(text='There is one person in the image.', generate_token_len=8, input_token_len=756, session_id=1, finish_reason='stop', token_ids=[3862, 374, 825, 1697, 304, 279, 2168, 13], logprobs=None, index=0)
```
32 changes: 32 additions & 0 deletions examples/python/offline_vl.py
@@ -0,0 +1,32 @@
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from lmdeploy.vl import load_image
import argparse

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='test model')
parser.add_argument('model_path',
type=str,
                        help='the path of the local model or '
                        'the repo_id of the model on huggingface.co',
                        default='llava-hf/llava-interleave-qwen-7b-hf')
parser.add_argument(
'--model-format',
type=str,
help='model format',
default='hf',
choices=['hf', 'awq'])
parser.add_argument(
'--max-new-tokens',
type=int,
        help='maximum number of new tokens to generate',
default=128)
args = parser.parse_args()
pipe = pipeline(args.model_path,
backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5, model_format=args.model_format),
gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens))

image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
for prompt in ["Describe the image.", "How many people in the image?"]:
print(f"prompt:{prompt}")
response = pipe((prompt, image))
print(response)
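
The pipeline also accepts a batch of `(prompt, image)` tuples, so the loop above can be collapsed into a single call. A self-contained sketch, assuming the same local model path and that list input returns one `Response` per pair:

```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Batched VLM inference sketch; the model path is a placeholder for the
# local llava-interleave checkpoint used in the README above.
pipe = pipeline('models/llava-interleave-qwen-7b-hf',
                backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5))
image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
prompts = ['Describe the image.', 'How many people in the image?']
responses = pipe([(p, image) for p in prompts])
for prompt, resp in zip(prompts, responses):
    print(f'prompt:{prompt}')
    print(resp.text)
```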
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -10,3 +10,4 @@
from .mixtral import MixtralModel # noqa: F401
from .qwen import QwenModel # noqa: F401
from .xcomposer2 import Xcomposer2Model # noqa: F401
from .llava_qwen2 import LlavaQwen2Model # noqa: F401
78 changes: 78 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/llava_qwen2.py
@@ -0,0 +1,78 @@
import json
import os.path as osp

from .base import INPUT_MODELS
from .llama import LlamaReader, LlamaModel


class LlavaQwen2Reader(LlamaReader):
"""LlavaQwen2Reader for llama model."""

attn_layer_prefix = 'language_model.model.layers'
attn_layer_patten = r'language_model.model.layers.([0-9]+).'
tok_embeddings_key = 'language_model.model.embed_tokens.weight'
norm_weight_key = 'language_model.model.norm.weight'
output_weight_key = 'language_model.lm_head.weight'

def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict, policy):
model_cfg = model_cfg.get('text_config')
super().__init__(new_params, unused_params, last_bin, model_cfg, policy)


@INPUT_MODELS.register_module(name='llava_qwen2')
class LlavaQwen2Model(LlamaModel):
"""LlavaQwen2Model model in hf format."""

def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
arch = config.text_config.architectures[0]
self.Reader = LlavaQwen2Reader

def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'config.json')
with open(params_path) as f:
model_arg = json.load(f)['text_config']
num_layer = model_arg.get('num_hidden_layers', 32)
norm_eps = model_arg.get('rms_norm_eps', 1e-6)
attn_head_num = model_arg.get('num_attention_heads', 32)
if 'num_key_value_heads' in model_arg:
kv_head_num = model_arg.get('num_key_value_heads', 32)
else:
kv_head_num = model_arg.get('num_attention_heads', 32)
rope_theta = float(model_arg.get('rope_theta', 10000.0))
max_position_embeddings = int(
model_arg.get('max_position_embeddings', 0))
rope_scaling = model_arg.get('rope_scaling', None)
scaling_factor = 0.0
use_dynamic_ntk = 0

# special for the model: llava-hf/llava-interleave-qwen-7b-hf
hidden_units = model_arg.get('hidden_size', 4096)
vocab_size = model_arg.get('vocab_size', 152000)
intermediate_size = model_arg.get("intermediate_size", 11008)
attn_bias = int(model_arg.get('attn_bias', 1))
use_logn_attn = int(model_arg.get('use_logn_attn', 0))

if isinstance(rope_scaling, dict):
scaling_type = model_arg['rope_scaling'].get('type', '')
            scaling_factor = model_arg['rope_scaling'].get('factor', 0.0)
if scaling_type == 'dynamic':
use_dynamic_ntk = 1

return dict(num_layer=num_layer,
norm_eps=norm_eps,
head_num=attn_head_num,
hidden_units=hidden_units,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
use_dynamic_ntk=use_dynamic_ntk,
rope_scaling_factor=scaling_factor,
inter_size=intermediate_size,
                    use_logn_attn=use_logn_attn,
attn_bias=attn_bias,
vocab_size=vocab_size)
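
For reference, a standalone sketch of how `attn_layer_patten` selects per-layer language-model weights while skipping the vision tower; the key names below are hypothetical examples of the llava-hf checkpoint layout, not read from an actual file:

```python
import re

# Keys mimicking the llava-hf layout (hypothetical examples).
keys = [
    'language_model.model.layers.0.self_attn.q_proj.weight',
    'language_model.model.embed_tokens.weight',
    'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight',
]
patten = r'language_model.model.layers.([0-9]+).'
for key in keys:
    match = re.search(patten, key)
    print(key, '->', f'layer {match.group(1)}' if match else 'not a decoder layer weight')
```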
4 changes: 4 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -25,6 +25,8 @@
# llava
LlavaLlamaForCausalLM='llama',
LlavaMistralForCausalLM='llama',
    # llava interleave
    LlavaForConditionalGeneration='llava_qwen2',
# xcomposer2
InternLMXComposer2ForCausalLM='xcomposer2',
# internvl
@@ -99,5 +101,7 @@ def _is_head_dim_128(cfg):
elif arch == 'InternVLChatModel':
# internvl2-4b,internlm2-1b are not working yet
support_by_turbomind = _is_head_dim_128(cfg.llm_config)
elif arch == 'LlavaForConditionalGeneration':
support_by_turbomind = _is_head_dim_128(cfg.text_config)

return support_by_turbomind
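
A quick sanity check of the new branch, assuming the `is_supported` helper in this module keeps its path-based signature and the checkpoint has been downloaded locally:

```python
from lmdeploy.turbomind.supported_models import is_supported

# Placeholder path; point it at the local llava-interleave-qwen-7b-hf directory.
print(is_supported('models/llava-interleave-qwen-7b-hf'))
```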
