From 5a212d026d842773db4d973c0e010d67a2e0f7c2 Mon Sep 17 00:00:00 2001 From: Willow Date: Mon, 4 Nov 2024 10:28:15 +0000 Subject: [PATCH 1/8] feat: support llava_qwen2 for fp16 and awq --- examples/python/README.md | 19 +++++ examples/python/offline_vl.py | 32 ++++++++ .../turbomind/deploy/source_model/__init__.py | 1 + .../deploy/source_model/llava_qwen2.py | 78 +++++++++++++++++++ lmdeploy/turbomind/supported_models.py | 4 + 5 files changed, 134 insertions(+) create mode 100644 examples/python/README.md create mode 100644 examples/python/offline_vl.py create mode 100644 lmdeploy/turbomind/deploy/source_model/llava_qwen2.py diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 0000000000..95fdfe8cf2 --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1,19 @@ +## Support LLava-Interleave-Qwen-7B-hf + +### AWQ + +lmdeploy lite auto_awq --work_dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf +lmdeploy serve api_server models/llava-interleave-qwen-7b-hf/awq --model-format awq + + +### Offline + +python3 offline_vl.py models/llava-interleave-qwen-7b-hf + +python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq + +```text +Response(text="The image is a photograph capturing a moment between a person and a dog on a sandy beach. The person is seated on the sand, wearing a plaid shirt and pants, with their legs crossed. They are holding a small object in their hand, which appears to be a toy or a small treat, and are extending their hand towards the dog. The dog, which is standing on the sand, has its front paws raised towards the person's hand, suggesting an interaction or a gesture of play or gratitude. The dog is wearing a colorful harness with a pattern that includes blue, red, and yellow colors. The background features a calm sea with gentle waves lapping at the shore, and the sky is clear with a soft gradient from light to darker blue, indicating either sunrise or sunset. The lighting in the photograph is warm, contributing to the serene atmosphere of the scene. There are no visible texts or brands in the image.", generate_token_len=187, input_token_len=753, session_id=0, finish_reason='stop', token_ids=[785, 2168, 374, 264, 10300, 39780, 264, 4445, 1948, 264, 1697, 323, 264, 5562, 389, 264, 67439, 11321, 13, 576, 1697, 374, 46313, 389, 279, 9278, 11, 12233, 264, 625, 3779, 15478, 323, 24549, 11, 448, 862, 14201, 27031, 13, 2379, 525, 9963, 264, 2613, 1633, 304, 862, 1424, 11, 892, 7952, 311, 387, 264, 21357, 476, 264, 2613, 4228, 11, 323, 525, 32359, 862, 1424, 6974, 279, 5562, 13, 576, 5562, 11, 892, 374, 11259, 389, 279, 9278, 11, 702, 1181, 4065, 281, 8635, 9226, 6974, 279, 1697, 594, 1424, 11, 22561, 458, 16230, 476, 264, 30157, 315, 1486, 476, 45035, 13, 576, 5562, 374, 12233, 264, 33866, 32408, 448, 264, 5383, 429, 5646, 6303, 11, 2518, 11, 323, 13753, 7987, 13, 576, 4004, 4419, 264, 19300, 9396, 448, 21700, 16876, 326, 3629, 518, 279, 30184, 11, 323, 279, 12884, 374, 2797, 448, 264, 8413, 20169, 504, 3100, 311, 39030, 6303, 11, 18860, 2987, 63819, 476, 42984, 13, 576, 17716, 304, 279, 10300, 374, 8205, 11, 28720, 311, 279, 94763, 16566, 315, 279, 6109, 13, 2619, 525, 902, 9434, 21984, 476, 15721, 304, 279, 2168, 13], logprobs=None, index=0) +prompt:How many people in the image? 
+Response(text='There is one person in the image.', generate_token_len=8, input_token_len=756, session_id=1, finish_reason='stop', token_ids=[3862, 374, 825, 1697, 304, 279, 2168, 13], logprobs=None, index=0) +``` \ No newline at end of file diff --git a/examples/python/offline_vl.py b/examples/python/offline_vl.py new file mode 100644 index 0000000000..54d881e013 --- /dev/null +++ b/examples/python/offline_vl.py @@ -0,0 +1,32 @@ +from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig +from lmdeploy.vl import load_image +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='test model') + parser.add_argument('model_path', + type=str, + help='the path of the model in localhost or ' + 'the repo_id of the model in huggingface.co', + default="llava-hf/llava-interleave-qwen-7b-hf") + parser.add_argument( + '--model-format', + type=str, + help='model format', + default='hf', + choices=['hf', 'awq']) + parser.add_argument( + '--max-new-tokens', + type=int, + help='output max tokens number', + default=128) + args = parser.parse_args() + pipe = pipeline(args.model_path, + backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5, model_format=args.model_format), + gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens)) + + image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg') + for prompt in ["Describe the image.", "How many people in the image?"]: + print(f"prompt:{prompt}") + response = pipe((prompt, image)) + print(response) \ No newline at end of file diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index a36102e1c6..dba7a706a5 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -10,3 +10,4 @@ from .mixtral import MixtralModel # noqa: F401 from .qwen import QwenModel # noqa: F401 from .xcomposer2 import Xcomposer2Model # noqa: F401 +from .llava_qwen2 import LlavaQwen2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py b/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py new file mode 100644 index 0000000000..16d662cf06 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py @@ -0,0 +1,78 @@ +import json +import os.path as osp + +from .base import INPUT_MODELS +from .llama import LlamaReader, LlamaModel + + +class LlavaQwen2Reader(LlamaReader): + """LlavaQwen2Reader for llama model.""" + + attn_layer_prefix = 'language_model.model.layers' + attn_layer_patten = r'language_model.model.layers.([0-9]+).' 
+ tok_embeddings_key = 'language_model.model.embed_tokens.weight' + norm_weight_key = 'language_model.model.norm.weight' + output_weight_key = 'language_model.lm_head.weight' + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, + model_cfg: dict, policy): + model_cfg = model_cfg.get('text_config') + super().__init__(new_params, unused_params, last_bin, model_cfg, policy) + + +@INPUT_MODELS.register_module(name='llava_qwen2') +class LlavaQwen2Model(LlamaModel): + """LlavaQwen2Model model in hf format.""" + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + arch = config.text_config.architectures[0] + self.Reader = LlavaQwen2Reader + + def model_info(self): + """Read model info.""" + params_path = osp.join(self.model_path, 'config.json') + with open(params_path) as f: + model_arg = json.load(f)['text_config'] + num_layer = model_arg.get('num_hidden_layers', 32) + norm_eps = model_arg.get('rms_norm_eps', 1e-6) + attn_head_num = model_arg.get('num_attention_heads', 32) + if 'num_key_value_heads' in model_arg: + kv_head_num = model_arg.get('num_key_value_heads', 32) + else: + kv_head_num = model_arg.get('num_attention_heads', 32) + rope_theta = float(model_arg.get('rope_theta', 10000.0)) + max_position_embeddings = int( + model_arg.get('max_position_embeddings', 0)) + rope_scaling = model_arg.get('rope_scaling', None) + scaling_factor = 0.0 + use_dynamic_ntk = 0 + + # special for the model: llava-hf/llava-interleave-qwen-7b-hf + hidden_units = model_arg.get('hidden_size', 4096) + vocab_size = model_arg.get('vocab_size', 152000) + intermediate_size = model_arg.get("intermediate_size", 11008) + attn_bias = int(model_arg.get('attn_bias', 1)) + use_logn_attn = int(model_arg.get('use_logn_attn', 0)) + + if isinstance(rope_scaling, dict): + scaling_type = model_arg['rope_scaling'].get('type', '') + scaling_factor = model_arg['rope_scaling'].get('factor', '') + if scaling_type == 'dynamic': + use_dynamic_ntk = 1 + + return dict(num_layer=num_layer, + norm_eps=norm_eps, + head_num=attn_head_num, + hidden_units=hidden_units, + kv_head_num=kv_head_num, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + use_dynamic_ntk=use_dynamic_ntk, + rope_scaling_factor=scaling_factor, + inter_size=intermediate_size, + use_logn_attn = use_logn_attn, + attn_bias=attn_bias, + vocab_size=vocab_size) \ No newline at end of file diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 8a1f5e7315..cef4d57457 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -25,6 +25,8 @@ # llava LlavaLlamaForCausalLM='llama', LlavaMistralForCausalLM='llama', + # Llava_interleave + LlavaForConditionalGeneration="llava_qwen2", # xcomposer2 InternLMXComposer2ForCausalLM='xcomposer2', # internvl @@ -99,5 +101,7 @@ def _is_head_dim_128(cfg): elif arch == 'InternVLChatModel': # internvl2-4b,internlm2-1b are not working yet support_by_turbomind = _is_head_dim_128(cfg.llm_config) + elif arch == 'LlavaForConditionalGeneration': + support_by_turbomind = _is_head_dim_128(cfg.text_config) return support_by_turbomind From eea842c2a850a0f5d0d1c149fe439cbca6100192 Mon Sep 17 00:00:00 2001 From: Willow Date: Tue, 5 Nov 2024 02:29:46 +0000 Subject: [PATCH 2/8] update generate gemm config script for VLM --- 
examples/python/README.md | 18 +++++++----------- lmdeploy/turbomind/generate_gemm_config.py | 20 ++++++++++++++++---- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/examples/python/README.md b/examples/python/README.md index 95fdfe8cf2..2d0beca053 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -1,19 +1,15 @@ ## Support LLava-Interleave-Qwen-7B-hf -### AWQ +### generate gemm config (Optional) -lmdeploy lite auto_awq --work_dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf -lmdeploy serve api_server models/llava-interleave-qwen-7b-hf/awq --model-format awq +`python3 lmdeploy/turbomind/generate_gemm_config.py --tensor-para-size 1 --max-batch-size 4 --model-path /models/llava-interleave-qwen-7b-hf` +### generate awq format model(Optional for awq format) -### Offline +`lmdeploy lite auto_awq --work_dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf` -python3 offline_vl.py models/llava-interleave-qwen-7b-hf +### start server -python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq +`python3 offline_vl.py models/llava-interleave-qwen-7b-hf` -```text -Response(text="The image is a photograph capturing a moment between a person and a dog on a sandy beach. The person is seated on the sand, wearing a plaid shirt and pants, with their legs crossed. They are holding a small object in their hand, which appears to be a toy or a small treat, and are extending their hand towards the dog. The dog, which is standing on the sand, has its front paws raised towards the person's hand, suggesting an interaction or a gesture of play or gratitude. The dog is wearing a colorful harness with a pattern that includes blue, red, and yellow colors. The background features a calm sea with gentle waves lapping at the shore, and the sky is clear with a soft gradient from light to darker blue, indicating either sunrise or sunset. The lighting in the photograph is warm, contributing to the serene atmosphere of the scene. There are no visible texts or brands in the image.", generate_token_len=187, input_token_len=753, session_id=0, finish_reason='stop', token_ids=[785, 2168, 374, 264, 10300, 39780, 264, 4445, 1948, 264, 1697, 323, 264, 5562, 389, 264, 67439, 11321, 13, 576, 1697, 374, 46313, 389, 279, 9278, 11, 12233, 264, 625, 3779, 15478, 323, 24549, 11, 448, 862, 14201, 27031, 13, 2379, 525, 9963, 264, 2613, 1633, 304, 862, 1424, 11, 892, 7952, 311, 387, 264, 21357, 476, 264, 2613, 4228, 11, 323, 525, 32359, 862, 1424, 6974, 279, 5562, 13, 576, 5562, 11, 892, 374, 11259, 389, 279, 9278, 11, 702, 1181, 4065, 281, 8635, 9226, 6974, 279, 1697, 594, 1424, 11, 22561, 458, 16230, 476, 264, 30157, 315, 1486, 476, 45035, 13, 576, 5562, 374, 12233, 264, 33866, 32408, 448, 264, 5383, 429, 5646, 6303, 11, 2518, 11, 323, 13753, 7987, 13, 576, 4004, 4419, 264, 19300, 9396, 448, 21700, 16876, 326, 3629, 518, 279, 30184, 11, 323, 279, 12884, 374, 2797, 448, 264, 8413, 20169, 504, 3100, 311, 39030, 6303, 11, 18860, 2987, 63819, 476, 42984, 13, 576, 17716, 304, 279, 10300, 374, 8205, 11, 28720, 311, 279, 94763, 16566, 315, 279, 6109, 13, 2619, 525, 902, 9434, 21984, 476, 15721, 304, 279, 2168, 13], logprobs=None, index=0) -prompt:How many people in the image? 
-Response(text='There is one person in the image.', generate_token_len=8, input_token_len=756, session_id=1, finish_reason='stop', token_ids=[3862, 374, 825, 1697, 304, 279, 2168, 13], logprobs=None, index=0) -``` \ No newline at end of file +`python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq` \ No newline at end of file diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 91b057d723..b697218fd7 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -54,10 +54,22 @@ def main(head_num: int = 32, from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - head_num = config.num_attention_heads - size_per_head = config.hidden_size // head_num - inter_size = config.intermediate_size - vocab_size = config.vocab_size + try: + head_num = config.num_attention_heads + size_per_head = config.hidden_size // head_num + inter_size = config.intermediate_size + vocab_size = config.vocab_size + except AttributeError as e: + if hasattr(config, "text_config"): + config = config.text_config + elif hasattr(config, "llm_config"): + config = config.llm_config + else: + raise AttributeError(f"not found attribute in {config}, please check your model config file. {e}") + head_num = config.num_attention_heads + size_per_head = config.hidden_size // head_num + inter_size = config.intermediate_size + vocab_size = config.vocab_size for bsz in range(1, max_batch_size + 1): subprocess.call( f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}' From 20c7476594ede66fa48ef4377ca73932a322a59c Mon Sep 17 00:00:00 2001 From: Willow Date: Tue, 5 Nov 2024 02:44:55 +0000 Subject: [PATCH 3/8] lint: fix lint warning --- examples/python/README.md | 2 +- examples/python/offline_vl.py | 46 ++++++++++--------- .../turbomind/deploy/source_model/__init__.py | 2 +- .../deploy/source_model/llava_qwen2.py | 15 +++--- lmdeploy/turbomind/generate_gemm_config.py | 7 +-- lmdeploy/turbomind/supported_models.py | 2 +- 6 files changed, 38 insertions(+), 36 deletions(-) diff --git a/examples/python/README.md b/examples/python/README.md index 2d0beca053..68f5c46c7d 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -12,4 +12,4 @@ `python3 offline_vl.py models/llava-interleave-qwen-7b-hf` -`python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq` \ No newline at end of file +`python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq` diff --git a/examples/python/offline_vl.py b/examples/python/offline_vl.py index 54d881e013..a28a8cbeeb 100644 --- a/examples/python/offline_vl.py +++ b/examples/python/offline_vl.py @@ -1,32 +1,34 @@ -from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig -from lmdeploy.vl import load_image import argparse -if __name__ == "__main__": +from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline +from lmdeploy.vl import load_image + +if __name__ == '__main__': parser = argparse.ArgumentParser(description='test model') parser.add_argument('model_path', type=str, help='the path of the model in localhost or ' 'the repo_id of the model in huggingface.co', - default="llava-hf/llava-interleave-qwen-7b-hf") - parser.add_argument( - '--model-format', - type=str, - help='model format', - default='hf', - choices=['hf', 'awq']) - parser.add_argument( - '--max-new-tokens', - type=int, - help='output max tokens number', - default=128) + 
default='llava-hf/llava-interleave-qwen-7b-hf') + parser.add_argument('--model-format', + type=str, + help='model format', + default='hf', + choices=['hf', 'awq']) + parser.add_argument('--max-new-tokens', + type=int, + help='output max tokens number', + default=128) args = parser.parse_args() - pipe = pipeline(args.model_path, - backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5, model_format=args.model_format), - gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens)) + pipe = pipeline( + args.model_path, + backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5, + model_format=args.model_format), + gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens)) - image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg') - for prompt in ["Describe the image.", "How many people in the image?"]: - print(f"prompt:{prompt}") + image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/' + + 'Qwen-VL/assets/demo.jpeg') + for prompt in ['Describe the image.', 'How many people in the image?']: + print(f'prompt:{prompt}') response = pipe((prompt, image)) - print(response) \ No newline at end of file + print(response) diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index dba7a706a5..b45db99c7c 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -5,9 +5,9 @@ from .internlm2 import InternLM2Model # noqa: F401 from .internvl import InternVLModel # noqa: F401 from .llama import LlamaModel # noqa: F401 +from .llava_qwen2 import LlavaQwen2Model # noqa: F401 from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 from .qwen import QwenModel # noqa: F401 from .xcomposer2 import Xcomposer2Model # noqa: F401 -from .llava_qwen2 import LlavaQwen2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py b/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py index 16d662cf06..8077c2c250 100644 --- a/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py +++ b/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py @@ -1,8 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import json import os.path as osp from .base import INPUT_MODELS -from .llama import LlamaReader, LlamaModel +from .llama import LlamaModel, LlamaReader class LlavaQwen2Reader(LlamaReader): @@ -17,7 +18,8 @@ class LlavaQwen2Reader(LlamaReader): def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, policy): model_cfg = model_cfg.get('text_config') - super().__init__(new_params, unused_params, last_bin, model_cfg, policy) + super().__init__(new_params, unused_params, last_bin, model_cfg, + policy) @INPUT_MODELS.register_module(name='llava_qwen2') @@ -26,9 +28,6 @@ class LlavaQwen2Model(LlamaModel): def __init__(self, model_path: str, tokenizer_path: str, **kwargs): super().__init__(model_path, tokenizer_path, **kwargs) - from transformers import AutoConfig - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - arch = config.text_config.architectures[0] self.Reader = LlavaQwen2Reader def model_info(self): @@ -53,7 +52,7 @@ def model_info(self): # special for the model: llava-hf/llava-interleave-qwen-7b-hf hidden_units = model_arg.get('hidden_size', 4096) vocab_size = model_arg.get('vocab_size', 152000) - intermediate_size = model_arg.get("intermediate_size", 11008) + intermediate_size = model_arg.get('intermediate_size', 11008) attn_bias = int(model_arg.get('attn_bias', 1)) use_logn_attn = int(model_arg.get('use_logn_attn', 0)) @@ -73,6 +72,6 @@ def model_info(self): use_dynamic_ntk=use_dynamic_ntk, rope_scaling_factor=scaling_factor, inter_size=intermediate_size, - use_logn_attn = use_logn_attn, + use_logn_attn=use_logn_attn, attn_bias=attn_bias, - vocab_size=vocab_size) \ No newline at end of file + vocab_size=vocab_size) diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index b697218fd7..83740b2c79 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -60,12 +60,13 @@ def main(head_num: int = 32, inter_size = config.intermediate_size vocab_size = config.vocab_size except AttributeError as e: - if hasattr(config, "text_config"): + if hasattr(config, 'text_config'): config = config.text_config - elif hasattr(config, "llm_config"): + elif hasattr(config, 'llm_config'): config = config.llm_config else: - raise AttributeError(f"not found attribute in {config}, please check your model config file. 
{e}") + raise AttributeError(f'not found attribute in {config},\ + please check your model config file.{e}') head_num = config.num_attention_heads size_per_head = config.hidden_size // head_num inter_size = config.intermediate_size diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index cef4d57457..7342abe90f 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -26,7 +26,7 @@ LlavaLlamaForCausalLM='llama', LlavaMistralForCausalLM='llama', # Llava_interleave - LlavaForConditionalGeneration="llava_qwen2", + LlavaForConditionalGeneration='llava_qwen2', # xcomposer2 InternLMXComposer2ForCausalLM='xcomposer2', # internvl From 2631814b2cc6b1a8c2be064fe9b57b8880cea904 Mon Sep 17 00:00:00 2001 From: Willow Date: Tue, 5 Nov 2024 09:09:22 +0000 Subject: [PATCH 4/8] doc: presenting the usage in the user guide --- docs/en/multi_modal/llava_qwen.md | 172 ++++++++++++++++++++++++++++++ examples/python/README.md | 15 --- examples/python/offline_vl.py | 34 ------ 3 files changed, 172 insertions(+), 49 deletions(-) create mode 100644 docs/en/multi_modal/llava_qwen.md delete mode 100644 examples/python/README.md delete mode 100644 examples/python/offline_vl.py diff --git a/docs/en/multi_modal/llava_qwen.md b/docs/en/multi_modal/llava_qwen.md new file mode 100644 index 0000000000..0c91fade7c --- /dev/null +++ b/docs/en/multi_modal/llava_qwen.md @@ -0,0 +1,172 @@ +# Llava-Qwen2 + +LMDeploy supports the following llava-qwen2 series of models, which are detailed in the table below: + +| Model | Size | Supported Inference Engine | +| :-------------------------: | :--: | :------------------------: | +| Llava-interleave-qwen-7b-hf | 7B | TurboMind | + +The next chapter demonstrates how to deploy an LlavaQwen2 model using LMDeploy, with [LlavaQwen2](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example. + +## Installation + +Please install LMDeploy by following the [installation guide](../get_started/installation.md). + +Or, you can build a docker image to set up the inference environment. If the CUDA version on your host machine is `>=12.4`, you can run: + +``` +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +docker build --build-arg CUDA_VERSION=cu12 -t openmmlab/lmdeploy:llava_qwen2 . -f ./docker/Dockerfile +``` + +Otherwise, you can go with: + +```shell +docker build --build-arg CUDA_VERSION=cu11 -t openmmlab/lmdeploy:llava_qwen2 . -f ./docker/Dockerfile +``` + +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) + +```python +from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline +from lmdeploy.vl import load_image + + +pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5), + gen_config=GenerationConfig(max_new_tokens=512)) + +image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg') +prompt = 'Describe the image.' +print(f'prompt:{prompt}') +response = pipe((prompt, image)) +print(response) + +``` + +More examples are listed below: + +
+<details>
+  <summary>
+    <b>multi-image multi-round conversation, combined images</b>
+  </summary>
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO')
+messages = [
+    dict(role='user', content=[
+        dict(type='text', text='Describe the two images in detail.'),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
+    ])
+]
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+
+messages.append(dict(role='assistant', content=out.text))
+messages.append(dict(role='user', content='What are the similarities and differences between these two images.'))
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+```
+
+</details>
+
+
+<details>
+  <summary>
+    <b>image resolution for performance boost</b>
+  </summary>
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO')
+
+min_pixels = 64 * 28 * 28
+max_pixels = 64 * 28 * 28
+messages = [
+    dict(role='user', content=[
+        dict(type='text', text='Describe the two images in detail.'),
+        dict(type='image_url', image_url=dict(min_pixels=min_pixels, max_pixels=max_pixels, url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
+        dict(type='image_url', image_url=dict(min_pixels=min_pixels, max_pixels=max_pixels, url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
+    ])
+]
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+
+messages.append(dict(role='assistant', content=out.text))
+messages.append(dict(role='user', content='What are the similarities and differences between these two images.'))
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+```
+
+</details>
+
+ +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +You can also start the service using the aforementioned built docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:llava_qwen2 \ + lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +The docker compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:llava_qwen2 + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +Then, you can execute the startup command as below: + +```shell +docker-compose up -d +``` + +If you find the following logs after running `docker logs -f lmdeploy`, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. 
+ +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/examples/python/README.md b/examples/python/README.md deleted file mode 100644 index 68f5c46c7d..0000000000 --- a/examples/python/README.md +++ /dev/null @@ -1,15 +0,0 @@ -## Support LLava-Interleave-Qwen-7B-hf - -### generate gemm config (Optional) - -`python3 lmdeploy/turbomind/generate_gemm_config.py --tensor-para-size 1 --max-batch-size 4 --model-path /models/llava-interleave-qwen-7b-hf` - -### generate awq format model(Optional for awq format) - -`lmdeploy lite auto_awq --work_dir models/llava-interleave-qwen-7b-hf/awq models/llava-interleave-qwen-7b-hf` - -### start server - -`python3 offline_vl.py models/llava-interleave-qwen-7b-hf` - -`python3 offline_vl.py models/llava-interleave-qwen-7b-hf/awq --model-format awq` diff --git a/examples/python/offline_vl.py b/examples/python/offline_vl.py deleted file mode 100644 index a28a8cbeeb..0000000000 --- a/examples/python/offline_vl.py +++ /dev/null @@ -1,34 +0,0 @@ -import argparse - -from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline -from lmdeploy.vl import load_image - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='test model') - parser.add_argument('model_path', - type=str, - help='the path of the model in localhost or ' - 'the repo_id of the model in huggingface.co', - default='llava-hf/llava-interleave-qwen-7b-hf') - parser.add_argument('--model-format', - type=str, - help='model format', - default='hf', - choices=['hf', 'awq']) - parser.add_argument('--max-new-tokens', - type=int, - help='output max tokens number', - default=128) - args = parser.parse_args() - pipe = pipeline( - args.model_path, - backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5, - model_format=args.model_format), - gen_config=GenerationConfig(max_new_tokens=args.max_new_tokens)) - - image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/' + - 'Qwen-VL/assets/demo.jpeg') - for prompt in ['Describe the image.', 'How many people in the image?']: - print(f'prompt:{prompt}') - response = pipe((prompt, image)) - print(response) From 6323319c8f5fdcbd4a96301fa33c65bf987d5f46 Mon Sep 17 00:00:00 2001 From: Willow Date: Thu, 7 Nov 2024 03:22:09 +0000 Subject: [PATCH 5/8] resolve conflict issue and refactor for better design --- docs/en/multi_modal/llava.md | 138 +++++++++++++++++- docs/en/multi_modal/llava_qwen.md | 136 ----------------- .../turbomind/deploy/source_model/__init__.py | 2 +- .../source_model/{llava_qwen2.py => llava.py} | 24 ++- lmdeploy/turbomind/generate_gemm_config.py | 12 +- lmdeploy/turbomind/supported_models.py | 7 +- 6 files changed, 162 insertions(+), 157 deletions(-) delete mode 100644 docs/en/multi_modal/llava_qwen.md rename lmdeploy/turbomind/deploy/source_model/{llava_qwen2.py => llava.py} (78%) diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md index cf95e15d5c..8f052227d5 100644 --- a/docs/en/multi_modal/llava.md +++ b/docs/en/multi_modal/llava.md @@ -1,3 +1,139 @@ # LLaVA -TODO +LMDeploy supports the following llava series of models, which are detailed in the table below: + +| Model | Size | Supported Inference Engine | +| :----------------------------------: | :--: | :------------------------: | +| llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch | +| llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch | +| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch | +| liuhaotian/llava-v1.6-mistral-7b | 7B 
| TurboMind, PyTorch |
+
+The next chapter demonstrates how to deploy a LLaVA model using LMDeploy, with [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example.
+
+## Installation
+
+Please install LMDeploy by following the [installation guide](../get_started/installation.md).
+
+Or, you can go with the official docker image:
+
+```shell
+docker pull openmmlab/lmdeploy:latest
+```
+
+## Offline inference
+
+The following sample code shows the basic usage of the VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md).
+
+```python
+from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
+from lmdeploy.vl import load_image
+
+
+pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5),
+                gen_config=GenerationConfig(max_new_tokens=512))
+
+image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
+prompt = 'Describe the image.'
+print(f'prompt:{prompt}')
+response = pipe((prompt, image))
+print(response)
+
+```
+
+More examples are listed below:
+
+<details>
+  <summary>
+    <b>multi-image multi-round conversation, combined images</b>
+  </summary>
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO')
+messages = [
+    dict(role='user', content=[
+        dict(type='text', text='Describe the two images in detail.'),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
+    ])
+]
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+
+messages.append(dict(role='assistant', content=out.text))
+messages.append(dict(role='user', content='What are the similarities and differences between these two images.'))
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+```
+
+</details>
+
+ +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +You can also start the service using the aforementioned built docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +The docker compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:latest + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +Then, you can execute the startup command as below: + +```shell +docker-compose up -d +``` + +If you find the following logs after running `docker logs -f lmdeploy`, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. + +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/docs/en/multi_modal/llava_qwen.md b/docs/en/multi_modal/llava_qwen.md deleted file mode 100644 index 4f8e3e51e4..0000000000 --- a/docs/en/multi_modal/llava_qwen.md +++ /dev/null @@ -1,136 +0,0 @@ -# Llava-Qwen2 - -LMDeploy supports the following llava-qwen2 series of models, which are detailed in the table below: - -| Model | Size | Supported Inference Engine | -| :-------------------------: | :--: | :------------------------: | -| Llava-interleave-qwen-7b-hf | 7B | TurboMind | - -The next chapter demonstrates how to deploy an LlavaQwen2 model using LMDeploy, with [LlavaQwen2](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example. - -## Installation - -Please install LMDeploy by following the [installation guide](../get_started/installation.md). - -Or, you can go with office docker image: - -```shell -docker pull openmmlab/lmdeploy:latest -``` - -## Offline inference - -The following sample code shows the basic usage of VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) - -```python -from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline -from lmdeploy.vl import load_image - - -pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5), - gen_config=GenerationConfig(max_new_tokens=512)) - -image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg') -prompt = 'Describe the image.' 
-print(f'prompt:{prompt}') -response = pipe((prompt, image)) -print(response) - -``` - -More examples are listed below: - -
- - multi-image multi-round conversation, combined images - - -```python -from lmdeploy import pipeline, GenerationConfig - -pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO') -messages = [ - dict(role='user', content=[ - dict(type='text', text='Describe the two images in detail.'), - dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')), - dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg')) - ]) -] -out = pipe(messages, gen_config=GenerationConfig(top_k=1)) - -messages.append(dict(role='assistant', content=out.text)) -messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) -out = pipe(messages, gen_config=GenerationConfig(top_k=1)) -``` - -
- -## Online serving - -You can launch the server by the `lmdeploy serve api_server` CLI: - -```shell -lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf -``` - -You can also start the service using the aforementioned built docker image: - -```shell -docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - -p 23333:23333 \ - --ipc=host \ - openmmlab/lmdeploy:llava_qwen2 \ - lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf -``` - -The docker compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows: - -```yaml -version: '3.5' - -services: - lmdeploy: - container_name: lmdeploy - image: openmmlab/lmdeploy:llava_qwen2 - ports: - - "23333:23333" - environment: - HUGGING_FACE_HUB_TOKEN: - volumes: - - ~/.cache/huggingface:/root/.cache/huggingface - stdin_open: true - tty: true - ipc: host - command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: "all" - capabilities: [gpu] -``` - -Then, you can execute the startup command as below: - -```shell -docker-compose up -d -``` - -If you find the following logs after running `docker logs -f lmdeploy`, it means the service launches successfully. - -```text -HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! -HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! -HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! -INFO: Started server process [2439] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) -``` - -The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. - -More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index b45db99c7c..b1da698e2e 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -5,7 +5,7 @@ from .internlm2 import InternLM2Model # noqa: F401 from .internvl import InternVLModel # noqa: F401 from .llama import LlamaModel # noqa: F401 -from .llava_qwen2 import LlavaQwen2Model # noqa: F401 +from .llava import LlavaModel # noqa: F401 from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py b/lmdeploy/turbomind/deploy/source_model/llava.py similarity index 78% rename from lmdeploy/turbomind/deploy/source_model/llava_qwen2.py rename to lmdeploy/turbomind/deploy/source_model/llava.py index 8077c2c250..0902468a77 100644 --- a/lmdeploy/turbomind/deploy/source_model/llava_qwen2.py +++ b/lmdeploy/turbomind/deploy/source_model/llava.py @@ -6,8 +6,8 @@ from .llama import LlamaModel, LlamaReader -class LlavaQwen2Reader(LlamaReader): - """LlavaQwen2Reader for llama model.""" +class LlavaReader(LlamaReader): + """LlavaReader for llama model.""" attn_layer_prefix = 'language_model.model.layers' attn_layer_patten = r'language_model.model.layers.([0-9]+).' 
@@ -22,16 +22,26 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, policy) -@INPUT_MODELS.register_module(name='llava_qwen2') -class LlavaQwen2Model(LlamaModel): - """LlavaQwen2Model model in hf format.""" +@INPUT_MODELS.register_module(name='llava') +class LlavaModel(LlamaModel): + """LlavaModel model in hf format.""" def __init__(self, model_path: str, tokenizer_path: str, **kwargs): super().__init__(model_path, tokenizer_path, **kwargs) - self.Reader = LlavaQwen2Reader + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + arch = config.architectures[0] + _readers = dict(LlavaForConditionalGeneration=LlavaReader, + LlavaMistralForCausalLM=LlamaReader, + LlavaLlamaForCausalLM=LlamaReader) + self.Reader = _readers[arch] + self.arch = arch def model_info(self): - """Read model info.""" + if self.arch in ['LlavaMistralForCausalLM', 'LlavaLlamaForCausalLM']: + return super().model_info() + """Read model info for LlavaForConditionalGeneration. + https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf""" params_path = osp.join(self.model_path, 'config.json') with open(params_path) as f: model_arg = json.load(f)['text_config'] diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 83740b2c79..a7689ebc27 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -59,14 +59,10 @@ def main(head_num: int = 32, size_per_head = config.hidden_size // head_num inter_size = config.intermediate_size vocab_size = config.vocab_size - except AttributeError as e: - if hasattr(config, 'text_config'): - config = config.text_config - elif hasattr(config, 'llm_config'): - config = config.llm_config - else: - raise AttributeError(f'not found attribute in {config},\ - please check your model config file.{e}') + except AttributeError: + for key in ['language_config', 'llm_config', 'text_config']: + config = getattr(config, key, config) + head_num = config.num_attention_heads size_per_head = config.hidden_size // head_num inter_size = config.intermediate_size diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 2e97e0f79b..17f8edf22c 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -23,10 +23,9 @@ # mistral MistralForCausalLM='llama', # llava - LlavaLlamaForCausalLM='llama', - LlavaMistralForCausalLM='llama', - # Llava_interleave - LlavaForConditionalGeneration='llava_qwen2', + LlavaLlamaForCausalLM='llava', + LlavaMistralForCausalLM='llava', + LlavaForConditionalGeneration='llava', # xcomposer2 InternLMXComposer2ForCausalLM='xcomposer2', # internvl From e5005f55e803858f586e69f369356de1950852f4 Mon Sep 17 00:00:00 2001 From: Willow Date: Fri, 8 Nov 2024 02:52:21 +0000 Subject: [PATCH 6/8] fix and doc: - fix tune attribute error - add chinese llava doc --- docs/zh_cn/multi_modal/llava.md | 134 ++++++++++++++++++++- lmdeploy/turbomind/generate_gemm_config.py | 21 ++-- lmdeploy/turbomind/supported_models.py | 5 +- 3 files changed, 145 insertions(+), 15 deletions(-) diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md index cf95e15d5c..c40f37308a 100644 --- a/docs/zh_cn/multi_modal/llava.md +++ b/docs/zh_cn/multi_modal/llava.md @@ -1,3 +1,135 @@ # LLaVA -TODO +LMDeploy 支持以下 LLaVA 系列模型,具体如下表所示: + +| 模型 | 大小 | 支持的推理引擎 | +| :----------------------------------: | :--: | :----------------: | +| 
llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch | +| llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch | +| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch | +| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind, PyTorch | + +接下来的章节将演示如何使用 LMDeploy 部署 LLaVA 模型,并以 [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) 为例。 + +## 安装 + +请按照[安装指南](../get_started/installation.md)安装 LMDeploy。 + +或者,您也可以使用官方的 Docker 镜像: + +```shell +docker pull openmmlab/lmdeploy:latest +``` + +## 离线推理 + +以下示例代码展示了 VLM pipeline 的基本用法。有关详细信息,请参考 [VLM 离线推理流程](./vl_pipeline.md)。 + +```python +from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline +from lmdeploy.vl import load_image + +pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5), + gen_config=GenerationConfig(max_new_tokens=512)) + +image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg') +prompt = 'Describe the image.' +print(f'prompt:{prompt}') +response = pipe((prompt, image)) +print(response) +``` + +更多示例: + +
+<details>
+  <summary>
+    <b>多图片多轮对话,组合图片</b>
+  </summary>
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO')
+messages = [
+    dict(role='user', content=[
+        dict(type='text', text='Describe the two images in detail.'),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
+    ])
+]
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+
+messages.append(dict(role='assistant', content=out.text))
+messages.append(dict(role='user', content='What are the similarities and differences between these two images.'))
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+```
+
+</details>
+
+ +## 在线服务 + +可以使用 `lmdeploy serve api_server` CLI 启动服务器: + +```shell +lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +或者,使用前面提到的 Docker 镜像启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +采用 Docker Compose 部署也是一种常见选择。在 lmdeploy 项目的根目录创建 `docker-compose.yml` 文件,如下: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:latest + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +然后,可以执行以下命令启动服务: + +```shell +docker-compose up -d +``` + +当运行 `docker logs -f lmdeploy` 后看到如下日志,说明服务启动成功: + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +可以通过 `lmdeploy serve api_server -h` 查看 `lmdeploy serve api_server` 的参数详情。 + +关于 `api_server` 以及如何访问服务的更多信息可以在[这里](api_server_vl.md)找到。 diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index a7689ebc27..34e769776f 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -54,19 +54,14 @@ def main(head_num: int = 32, from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - try: - head_num = config.num_attention_heads - size_per_head = config.hidden_size // head_num - inter_size = config.intermediate_size - vocab_size = config.vocab_size - except AttributeError: - for key in ['language_config', 'llm_config', 'text_config']: - config = getattr(config, key, config) - - head_num = config.num_attention_heads - size_per_head = config.hidden_size // head_num - inter_size = config.intermediate_size - vocab_size = config.vocab_size + + for key in ['language_config', 'llm_config', 'text_config']: + config = getattr(config, key, config) + head_num = config.num_attention_heads + size_per_head = config.hidden_size // head_num + inter_size = config.intermediate_size + vocab_size = config.vocab_size + for bsz in range(1, max_batch_size + 1): subprocess.call( f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}' diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 17f8edf22c..88ca22717d 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -98,6 +98,9 @@ def _is_head_dim_supported(cfg): # internvl2-4b,internlm2-1b are not working yet support_by_turbomind = _is_head_dim_supported(cfg.llm_config) elif arch == 'LlavaForConditionalGeneration': - support_by_turbomind = _is_head_dim_supported(cfg.text_config) + sub_arch = cfg.text_config.architectures[0] + if sub_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: + support_by_turbomind = 
_is_head_dim_supported( + cfg.text_config) return support_by_turbomind From 48d1a5cc0c141ec03fa68875897eed1be0c3d694 Mon Sep 17 00:00:00 2001 From: Willow Date: Fri, 8 Nov 2024 03:13:40 +0000 Subject: [PATCH 7/8] keep LlavaLlamaForCausalLM/LlavaMistralForCausalLM to llama --- lmdeploy/turbomind/deploy/source_model/llava.py | 12 ++++++------ lmdeploy/turbomind/supported_models.py | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/lmdeploy/turbomind/deploy/source_model/llava.py b/lmdeploy/turbomind/deploy/source_model/llava.py index 0902468a77..7d0a1ff058 100644 --- a/lmdeploy/turbomind/deploy/source_model/llava.py +++ b/lmdeploy/turbomind/deploy/source_model/llava.py @@ -30,18 +30,18 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs): super().__init__(model_path, tokenizer_path, **kwargs) from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + config = getattr(config, 'text_config', config) arch = config.architectures[0] - _readers = dict(LlavaForConditionalGeneration=LlavaReader, - LlavaMistralForCausalLM=LlamaReader, - LlavaLlamaForCausalLM=LlamaReader) + _readers = dict(Qwen2ForCausalLM=LlavaReader, + LlamaForCausalL=LlavaReader) self.Reader = _readers[arch] self.arch = arch def model_info(self): - if self.arch in ['LlavaMistralForCausalLM', 'LlavaLlamaForCausalLM']: - return super().model_info() """Read model info for LlavaForConditionalGeneration. - https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf""" + + https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf + """ params_path = osp.join(self.model_path, 'config.json') with open(params_path) as f: model_arg = json.load(f)['text_config'] diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 88ca22717d..fe0819d70f 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -23,8 +23,8 @@ # mistral MistralForCausalLM='llama', # llava - LlavaLlamaForCausalLM='llava', - LlavaMistralForCausalLM='llava', + LlavaLlamaForCausalLM='llama', + LlavaMistralForCausalLM='llama', LlavaForConditionalGeneration='llava', # xcomposer2 InternLMXComposer2ForCausalLM='xcomposer2', @@ -95,7 +95,6 @@ def _is_head_dim_supported(cfg): # glm-4v-9b not supported support_by_turbomind = False elif arch == 'InternVLChatModel': - # internvl2-4b,internlm2-1b are not working yet support_by_turbomind = _is_head_dim_supported(cfg.llm_config) elif arch == 'LlavaForConditionalGeneration': sub_arch = cfg.text_config.architectures[0] From 4c55c8dc4d750ee7026b45ab6ff563e5138249e0 Mon Sep 17 00:00:00 2001 From: Willow Date: Fri, 8 Nov 2024 06:49:57 +0000 Subject: [PATCH 8/8] fix attn_bias default value --- lmdeploy/turbomind/deploy/source_model/llava.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/deploy/source_model/llava.py b/lmdeploy/turbomind/deploy/source_model/llava.py index 7d0a1ff058..3b4d82c37b 100644 --- a/lmdeploy/turbomind/deploy/source_model/llava.py +++ b/lmdeploy/turbomind/deploy/source_model/llava.py @@ -33,7 +33,7 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs): config = getattr(config, 'text_config', config) arch = config.architectures[0] _readers = dict(Qwen2ForCausalLM=LlavaReader, - LlamaForCausalL=LlavaReader) + LlamaForCausalLM=LlavaReader) self.Reader = _readers[arch] self.arch = arch @@ -63,7 +63,9 @@ def model_info(self): hidden_units = model_arg.get('hidden_size', 4096) vocab_size = 
model_arg.get('vocab_size', 152000) intermediate_size = model_arg.get('intermediate_size', 11008) - attn_bias = int(model_arg.get('attn_bias', 1)) + attn_bias = 1 if model_arg['architectures'][0] \ + == 'Qwen2ForCausalLM' else 0 + attn_bias = int(model_arg.get('attn_bias', attn_bias)) use_logn_attn = int(model_arg.get('use_logn_attn', 0)) if isinstance(rope_scaling, dict):