
[Feature]: support LlavaForConditionalGeneration with turbomind inference #2710

Merged · 9 commits · Nov 8, 2024
15 changes: 15 additions & 0 deletions examples/python/README.md
@@ -0,0 +1,15 @@
## Support LLaVA-Interleave-Qwen-7B-hf

### generate gemm config (Optional)

`python3 lmdeploy/turbomind/generate_gemm_config.py --tensor-para-size 1 --max-batch-size 4 --model-path /models/llava-interleave-qwen-7b-hf`

### generate awq format model (Optional)

`lmdeploy lite auto_awq --work-dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf`

### run offline inference

`python3 offline_vl.py models/llava-interleave-qwen-7b-hf`

`python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq`
34 changes: 34 additions & 0 deletions examples/python/offline_vl.py
@@ -0,0 +1,34 @@
import argparse

from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='offline VLM inference demo')
    parser.add_argument('model_path',
                        type=str,
                        nargs='?',
                        help='local path of the model or '
                        'repo_id of the model on huggingface.co',
                        default='llava-hf/llava-interleave-qwen-7b-hf')
    parser.add_argument('--model-format',
                        type=str,
                        help='model format',
                        default='hf',
                        choices=['hf', 'awq'])
    parser.add_argument('--max-new-tokens',
                        type=int,
                        help='maximum number of new tokens to generate',
                        default=128)
    args = parser.parse_args()
    pipe = pipeline(
        args.model_path,
        backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5,
                                             model_format=args.model_format),
        gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens))

    image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/' +
                       'Qwen-VL/assets/demo.jpeg')
    for prompt in ['Describe the image.', 'How many people are in the image?']:
        print(f'prompt: {prompt}')
        response = pipe((prompt, image))
        print(response)
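
The pipeline used above also accepts a batch of (prompt, image) pairs in one call, which avoids looping over prompts. A minimal sketch of that usage (the batching call and the `.text` field are assumed from lmdeploy's VLM pipeline API; the model path is illustrative, not part of this PR):

```python
from lmdeploy import TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image

# Sketch: batched VLM inference; the model path is illustrative.
pipe = pipeline('models/llava-interleave-qwen-7b-hf',
                backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5))
image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/' +
                   'Qwen-VL/assets/demo.jpeg')
prompts = [('Describe the image.', image),
           ('How many people are in the image?', image)]
responses = pipe(prompts)  # one Response per (prompt, image) pair
for res in responses:
    print(res.text)
```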
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -5,6 +5,7 @@
from .internlm2 import InternLM2Model # noqa: F401
from .internvl import InternVLModel # noqa: F401
from .llama import LlamaModel # noqa: F401
from .llava_qwen2 import LlavaQwen2Model # noqa: F401
from .meta_llama import MetaLlamaModel # noqa: F401
from .minicpmv import MiniCPMVModel # noqa: F401
from .mixtral import MixtralModel # noqa: F401
77 changes: 77 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/llava_qwen2.py
@@ -0,0 +1,77 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp

from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader


class LlavaQwen2Reader(LlamaReader):
    """Reader for LLaVA models with a Qwen2 language model."""

    attn_layer_prefix = 'language_model.model.layers'
    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
    norm_weight_key = 'language_model.model.norm.weight'
    output_weight_key = 'language_model.lm_head.weight'

    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
                 model_cfg: dict, policy):
        model_cfg = model_cfg.get('text_config')
        super().__init__(new_params, unused_params, last_bin, model_cfg,
                         policy)


@INPUT_MODELS.register_module(name='llava_qwen2')
class LlavaQwen2Model(LlamaModel):
    """LLaVA (Qwen2 LLM) model in hf format."""

    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
        super().__init__(model_path, tokenizer_path, **kwargs)
        self.Reader = LlavaQwen2Reader

    def model_info(self):
        """Read model info from the nested text_config in config.json."""
        params_path = osp.join(self.model_path, 'config.json')
        with open(params_path) as f:
            model_arg = json.load(f)['text_config']
            num_layer = model_arg.get('num_hidden_layers', 32)
            norm_eps = model_arg.get('rms_norm_eps', 1e-6)
            attn_head_num = model_arg.get('num_attention_heads', 32)
            if 'num_key_value_heads' in model_arg:
                kv_head_num = model_arg.get('num_key_value_heads', 32)
            else:
                kv_head_num = model_arg.get('num_attention_heads', 32)
            rope_theta = float(model_arg.get('rope_theta', 10000.0))
            max_position_embeddings = int(
                model_arg.get('max_position_embeddings', 0))
            rope_scaling = model_arg.get('rope_scaling', None)
            scaling_factor = 0.0
            use_dynamic_ntk = 0

            # defaults below match llava-hf/llava-interleave-qwen-7b-hf
            hidden_units = model_arg.get('hidden_size', 4096)
            vocab_size = model_arg.get('vocab_size', 152000)
            intermediate_size = model_arg.get('intermediate_size', 11008)
            attn_bias = int(model_arg.get('attn_bias', 1))
            use_logn_attn = int(model_arg.get('use_logn_attn', 0))

            if isinstance(rope_scaling, dict):
                scaling_type = model_arg['rope_scaling'].get('type', '')
                scaling_factor = model_arg['rope_scaling'].get('factor', 0.0)
                if scaling_type == 'dynamic':
                    use_dynamic_ntk = 1

        return dict(num_layer=num_layer,
                    norm_eps=norm_eps,
                    head_num=attn_head_num,
                    hidden_units=hidden_units,
                    kv_head_num=kv_head_num,
                    rope_theta=rope_theta,
                    max_position_embeddings=max_position_embeddings,
                    use_dynamic_ntk=use_dynamic_ntk,
                    rope_scaling_factor=scaling_factor,
                    inter_size=intermediate_size,
                    use_logn_attn=use_logn_attn,
                    attn_bias=attn_bias,
                    vocab_size=vocab_size)
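
For reference, a hedged sketch of the `text_config` that `model_info()` above consumes; the field values are illustrative (roughly those of a Qwen2-7B-based LLaVA checkpoint) rather than copied from the PR:

```python
# Illustrative, trimmed config.json for a LlavaForConditionalGeneration
# checkpoint; real files carry many more fields.
sample_config = {
    'architectures': ['LlavaForConditionalGeneration'],
    'text_config': {
        'num_hidden_layers': 28,
        'num_attention_heads': 28,
        'num_key_value_heads': 4,
        'hidden_size': 3584,
        'intermediate_size': 18944,
        'rms_norm_eps': 1e-06,
        'rope_theta': 1000000.0,
        'vocab_size': 152064,
    },
}

text_cfg = sample_config['text_config']
# the head dimension turbomind checks for (see supported_models.py below)
print(text_cfg['hidden_size'] // text_cfg['num_attention_heads'])  # 128
```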
21 changes: 17 additions & 4 deletions lmdeploy/turbomind/generate_gemm_config.py
@@ -54,10 +54,23 @@ def main(head_num: int = 32,
         from transformers import AutoConfig
         config = AutoConfig.from_pretrained(model_path,
                                             trust_remote_code=True)
-        head_num = config.num_attention_heads
-        size_per_head = config.hidden_size // head_num
-        inter_size = config.intermediate_size
-        vocab_size = config.vocab_size
+        try:
+            head_num = config.num_attention_heads
+            size_per_head = config.hidden_size // head_num
+            inter_size = config.intermediate_size
+            vocab_size = config.vocab_size
+        except AttributeError as e:
+            if hasattr(config, 'text_config'):
+                config = config.text_config
+            elif hasattr(config, 'llm_config'):
+                config = config.llm_config
+            else:
+                raise AttributeError(f'required attributes not found in {config}; '
+                                     'please check your model config file.') from e
+            head_num = config.num_attention_heads
+            size_per_head = config.hidden_size // head_num
+            inter_size = config.intermediate_size
+            vocab_size = config.vocab_size
     for bsz in range(1, max_batch_size + 1):
         subprocess.call(
             f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}'
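
A hedged sketch of the fallback this hunk introduces: multimodal configs keep the LLM hyperparameters in a nested sub-config (`text_config` for LLaVA-style checkpoints, `llm_config` for InternVL-style ones), so the lookup retries on that sub-config. The helper name below is illustrative and not part of the PR:

```python
from transformers import AutoConfig


def resolve_llm_config(model_path: str):
    """Return the (sub-)config that carries the LLM hyperparameters."""
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    if hasattr(config, 'num_attention_heads'):
        return config
    if hasattr(config, 'text_config'):  # e.g. LlavaForConditionalGeneration
        return config.text_config
    if hasattr(config, 'llm_config'):  # e.g. InternVLChatModel
        return config.llm_config
    raise AttributeError(f'required attributes not found in {config}; '
                         'please check your model config file.')
```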
4 changes: 4 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -25,6 +25,8 @@
    # llava
    LlavaLlamaForCausalLM='llama',
    LlavaMistralForCausalLM='llama',
    # llava-interleave
    LlavaForConditionalGeneration='llava_qwen2',
    # xcomposer2
    InternLMXComposer2ForCausalLM='xcomposer2',
    # internvl
@@ -99,5 +101,7 @@ def _is_head_dim_128(cfg):
    elif arch == 'InternVLChatModel':
        # internvl2-4b, internlm2-1b are not working yet
        support_by_turbomind = _is_head_dim_128(cfg.llm_config)
    elif arch == 'LlavaForConditionalGeneration':
        support_by_turbomind = _is_head_dim_128(cfg.text_config)

    return support_by_turbomind
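
For context, a sketch of the check `_is_head_dim_128` is assumed to perform: turbomind's attention kernels require a head dimension of 128, computed from the language-model sub-config. This mirrors the existing helper rather than adding behavior:

```python
def is_head_dim_128_sketch(llm_cfg) -> bool:
    # head_dim = hidden_size / num_attention_heads; turbomind assumes 128
    return llm_cfg.hidden_size // llm_cfg.num_attention_heads == 128
```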