From 5a212d026d842773db4d973c0e010d67a2e0f7c2 Mon Sep 17 00:00:00 2001
From: Willow
Date: Mon, 4 Nov 2024 10:28:15 +0000
Subject: [PATCH] feat: support llava_qwen2 for fp16 and awq

---
 examples/python/README.md                          | 19 +++++
 examples/python/offline_vl.py                      | 32 ++++++++
 .../turbomind/deploy/source_model/__init__.py      |  1 +
 .../deploy/source_model/llava_qwen2.py             | 78 +++++++++++++++++++
 lmdeploy/turbomind/supported_models.py             |  4 +
 5 files changed, 134 insertions(+)
 create mode 100644 examples/python/README.md
 create mode 100644 examples/python/offline_vl.py
 create mode 100644 lmdeploy/turbomind/deploy/source_model/llava_qwen2.py

diff --git a/examples/python/README.md b/examples/python/README.md
new file mode 100644
index 0000000000..95fdfe8cf2
--- /dev/null
+++ b/examples/python/README.md
@@ -0,0 +1,19 @@
+## Support LLaVA-Interleave-Qwen-7B-hf
+
+### AWQ
+
+lmdeploy lite auto_awq --work_dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf
+lmdeploy serve api_server models/llava-interleave-qwen-7b-hf/awq --model-format awq
+
+
+### Offline
+
+python3 offline_vl.py models/llava-interleave-qwen-7b-hf
+
+python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq
+
+```text
+Response(text="The image is a photograph capturing a moment between a person and a dog on a sandy beach. The person is seated on the sand, wearing a plaid shirt and pants, with their legs crossed. They are holding a small object in their hand, which appears to be a toy or a small treat, and are extending their hand towards the dog. The dog, which is standing on the sand, has its front paws raised towards the person's hand, suggesting an interaction or a gesture of play or gratitude. The dog is wearing a colorful harness with a pattern that includes blue, red, and yellow colors. The background features a calm sea with gentle waves lapping at the shore, and the sky is clear with a soft gradient from light to darker blue, indicating either sunrise or sunset. The lighting in the photograph is warm, contributing to the serene atmosphere of the scene. There are no visible texts or brands in the image.", generate_token_len=187, input_token_len=753, session_id=0, finish_reason='stop', token_ids=[785, 2168, 374, 264, 10300, 39780, 264, 4445, 1948, 264, 1697, 323, 264, 5562, 389, 264, 67439, 11321, 13, 576, 1697, 374, 46313, 389, 279, 9278, 11, 12233, 264, 625, 3779, 15478, 323, 24549, 11, 448, 862, 14201, 27031, 13, 2379, 525, 9963, 264, 2613, 1633, 304, 862, 1424, 11, 892, 7952, 311, 387, 264, 21357, 476, 264, 2613, 4228, 11, 323, 525, 32359, 862, 1424, 6974, 279, 5562, 13, 576, 5562, 11, 892, 374, 11259, 389, 279, 9278, 11, 702, 1181, 4065, 281, 8635, 9226, 6974, 279, 1697, 594, 1424, 11, 22561, 458, 16230, 476, 264, 30157, 315, 1486, 476, 45035, 13, 576, 5562, 374, 12233, 264, 33866, 32408, 448, 264, 5383, 429, 5646, 6303, 11, 2518, 11, 323, 13753, 7987, 13, 576, 4004, 4419, 264, 19300, 9396, 448, 21700, 16876, 326, 3629, 518, 279, 30184, 11, 323, 279, 12884, 374, 2797, 448, 264, 8413, 20169, 504, 3100, 311, 39030, 6303, 11, 18860, 2987, 63819, 476, 42984, 13, 576, 17716, 304, 279, 10300, 374, 8205, 11, 28720, 311, 279, 94763, 16566, 315, 279, 6109, 13, 2619, 525, 902, 9434, 21984, 476, 15721, 304, 279, 2168, 13], logprobs=None, index=0)
+prompt:How many people in the image?
+Response(text='There is one person in the image.', generate_token_len=8, input_token_len=756, session_id=1, finish_reason='stop', token_ids=[3862, 374, 825, 1697, 304, 279, 2168, 13], logprobs=None, index=0)
+```
\ No newline at end of file
diff --git a/examples/python/offline_vl.py b/examples/python/offline_vl.py
new file mode 100644
index 0000000000..54d881e013
--- /dev/null
+++ b/examples/python/offline_vl.py
@@ -0,0 +1,32 @@
+import argparse
+from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
+from lmdeploy.vl import load_image
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='test model')
+    parser.add_argument('model_path',
+                        type=str, nargs='?',
+                        help='the local path of the model or '
+                        'the repo_id of the model on huggingface.co',
+                        default="llava-hf/llava-interleave-qwen-7b-hf")
+    parser.add_argument(
+        '--model-format',
+        type=str,
+        help='model format',
+        default='hf',
+        choices=['hf', 'awq'])
+    parser.add_argument(
+        '--max-new-tokens',
+        type=int,
+        help='max number of tokens to generate',
+        default=128)
+    args = parser.parse_args()
+    pipe = pipeline(args.model_path,
+                    backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5, model_format=args.model_format),
+                    gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens))
+
+    image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
+    for prompt in ["Describe the image.", "How many people in the image?"]:
+        print(f"prompt:{prompt}")
+        response = pipe((prompt, image))
+        print(response)
\ No newline at end of file
diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py
index a36102e1c6..dba7a706a5 100644
--- a/lmdeploy/turbomind/deploy/source_model/__init__.py
+++ b/lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -10,3 +10,4 @@
 from .mixtral import MixtralModel  # noqa: F401
 from .qwen import QwenModel  # noqa: F401
 from .xcomposer2 import Xcomposer2Model  # noqa: F401
+from .llava_qwen2 import LlavaQwen2Model  # noqa: F401
diff --git a/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py b/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py
new file mode 100644
index 0000000000..16d662cf06
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py
@@ -0,0 +1,78 @@
+import json
+import os.path as osp
+
+from .base import INPUT_MODELS
+from .llama import LlamaReader, LlamaModel
+
+
+class LlavaQwen2Reader(LlamaReader):
+    """Read the language-model weights of llava_qwen2 (LLaVA-Interleave) checkpoints."""
+
+    attn_layer_prefix = 'language_model.model.layers'
+    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
+    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
+    norm_weight_key = 'language_model.model.norm.weight'
+    output_weight_key = 'language_model.lm_head.weight'
+
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict, policy):
+        model_cfg = model_cfg.get('text_config')
+        super().__init__(new_params, unused_params, last_bin, model_cfg, policy)
+
+
+@INPUT_MODELS.register_module(name='llava_qwen2')
+class LlavaQwen2Model(LlamaModel):
+    """LlavaQwen2Model model in hf format."""
+
+    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+        super().__init__(model_path, tokenizer_path, **kwargs)
+        from transformers import AutoConfig
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+        arch = config.text_config.architectures[0]
+        self.Reader = LlavaQwen2Reader
+
+    def model_info(self):
+        """Read model info."""
+        params_path = osp.join(self.model_path, 'config.json')
+        with open(params_path) as f:
+            model_arg = json.load(f)['text_config']
+            num_layer = model_arg.get('num_hidden_layers', 32)
+            norm_eps = model_arg.get('rms_norm_eps', 1e-6)
+            attn_head_num = model_arg.get('num_attention_heads', 32)
+            if 'num_key_value_heads' in model_arg:
+                kv_head_num = model_arg.get('num_key_value_heads', 32)
+            else:
+                kv_head_num = model_arg.get('num_attention_heads', 32)
+            rope_theta = float(model_arg.get('rope_theta', 10000.0))
+            max_position_embeddings = int(
+                model_arg.get('max_position_embeddings', 0))
+            rope_scaling = model_arg.get('rope_scaling', None)
+            scaling_factor = 0.0
+            use_dynamic_ntk = 0
+
+            # special for the model: llava-hf/llava-interleave-qwen-7b-hf
+            hidden_units = model_arg.get('hidden_size', 4096)
+            vocab_size = model_arg.get('vocab_size', 152000)
+            intermediate_size = model_arg.get('intermediate_size', 11008)
+            attn_bias = int(model_arg.get('attn_bias', 1))
+            use_logn_attn = int(model_arg.get('use_logn_attn', 0))
+
+            if isinstance(rope_scaling, dict):
+                scaling_type = model_arg['rope_scaling'].get('type', '')
+                scaling_factor = model_arg['rope_scaling'].get('factor', 0.0)
+                if scaling_type == 'dynamic':
+                    use_dynamic_ntk = 1
+
+        return dict(num_layer=num_layer,
+                    norm_eps=norm_eps,
+                    head_num=attn_head_num,
+                    hidden_units=hidden_units,
+                    kv_head_num=kv_head_num,
+                    rope_theta=rope_theta,
+                    max_position_embeddings=max_position_embeddings,
+                    use_dynamic_ntk=use_dynamic_ntk,
+                    rope_scaling_factor=scaling_factor,
+                    inter_size=intermediate_size,
+                    use_logn_attn=use_logn_attn,
+                    attn_bias=attn_bias,
+                    vocab_size=vocab_size)
\ No newline at end of file
diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py
index 8a1f5e7315..cef4d57457 100644
--- a/lmdeploy/turbomind/supported_models.py
+++ b/lmdeploy/turbomind/supported_models.py
@@ -25,6 +25,8 @@
     # llava
     LlavaLlamaForCausalLM='llama',
     LlavaMistralForCausalLM='llama',
+    # llava interleave
+    LlavaForConditionalGeneration='llava_qwen2',
     # xcomposer2
     InternLMXComposer2ForCausalLM='xcomposer2',
     # internvl
@@ -99,5 +101,7 @@ def _is_head_dim_128(cfg):
     elif arch == 'InternVLChatModel':
         # internvl2-4b,internlm2-1b are not working yet
         support_by_turbomind = _is_head_dim_128(cfg.llm_config)
+    elif arch == 'LlavaForConditionalGeneration':
+        support_by_turbomind = _is_head_dim_128(cfg.text_config)
 
     return support_by_turbomind
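
Below is a minimal sketch, not part of the diff above, of how a local checkpoint can be checked against the new routing in supported_models.py before deployment. It assumes that _is_head_dim_128 reduces to hidden_size // num_attention_heads == 128 on the text config (the helper's body is not shown in this patch); the script name, function name and local model path are illustrative only, reusing the placeholder path from examples/python/README.md.

    # check_llava_qwen2_routing.py -- illustrative sketch; names and path are assumptions.
    from transformers import AutoConfig


    def routed_to_llava_qwen2(model_path: str) -> bool:
        """Mirror the arch / head-dim check that supported_models.py now applies."""
        cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        arch = (getattr(cfg, 'architectures', None) or [None])[0]
        if arch != 'LlavaForConditionalGeneration':
            return False
        # llava_qwen2.py reads every LLM hyper-parameter from text_config.
        text_cfg = cfg.text_config
        head_dim = text_cfg.hidden_size // text_cfg.num_attention_heads
        # Assumed equivalent of _is_head_dim_128(cfg.text_config).
        return head_dim == 128


    if __name__ == '__main__':
        # Same local path as used in examples/python/README.md.
        print(routed_to_llava_qwen2('models/llava-interleave-qwen-7b-hf'))

A False result means turbomind would decline the checkpoint (head_dim other than 128), which is the same fallback behaviour the supported_models.py hunk encodes.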