From 78ab485e4a563e82f8d560e836ebeeec4a2c22b1 Mon Sep 17 00:00:00 2001
From: Willow
Date: Fri, 8 Nov 2024 19:31:35 +0800
Subject: [PATCH] [Feature]: support LlavaForConditionalGeneration with turbomind inference (#2710)

* feat: support llava_qwen2 for fp16 and awq

* update generate gemm config script for VLM

* lint: fix lint warning

* doc: presenting the usage in the user guide

* resolve conflict issue and refactor for better design

* fix and doc:
- fix tune attribute error
- add chinese llava doc

* keep LlavaLlamaForCausalLM/LlavaMistralForCausalLM to llama

* fix attn_bias default value
---
 docs/en/multi_modal/llava.md                   | 138 +++++++++++++++++-
 docs/zh_cn/multi_modal/llava.md                | 134 ++++++++++++++++-
 .../turbomind/deploy/source_model/__init__.py  |   1 +
 .../turbomind/deploy/source_model/llava.py     |  89 +++++++++++
 lmdeploy/turbomind/generate_gemm_config.py     |   4 +
 lmdeploy/turbomind/supported_models.py         |   6 +
 6 files changed, 370 insertions(+), 2 deletions(-)
 create mode 100644 lmdeploy/turbomind/deploy/source_model/llava.py

diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md
index cf95e15d5c..8f052227d5 100644
--- a/docs/en/multi_modal/llava.md
+++ b/docs/en/multi_modal/llava.md
@@ -1,3 +1,139 @@
 # LLaVA

-TODO

LMDeploy supports the following LLaVA series of models, which are detailed in the table below:

| Model                                | Size | Supported Inference Engine |
| :----------------------------------: | :--: | :------------------------: |
| llava-hf/llava-interleave-qwen-7b-hf | 7B   | TurboMind, PyTorch         |
| llava-hf/llava-1.5-7b-hf             | 7B   | TurboMind, PyTorch         |
| liuhaotian/llava-v1.6-vicuna-7b      | 7B   | TurboMind, PyTorch         |
| liuhaotian/llava-v1.6-mistral-7b     | 7B   | TurboMind, PyTorch         |

The following sections demonstrate how to deploy a LLaVA model using LMDeploy, taking [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example.

## Installation

Please install LMDeploy by following the [installation guide](../get_started/installation.md).

Alternatively, you can use the official docker image:

```shell
docker pull openmmlab/lmdeploy:latest
```

## Offline inference

The following sample code shows the basic usage of the VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md).

```python
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image

pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf",
                backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5),
                gen_config=GenerationConfig(max_new_tokens=512))

image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
prompt = 'Describe the image.'
print(f'prompt:{prompt}')
response = pipe((prompt, image))
print(response)
```

More examples are listed below:

<details>
  <summary>
    <b>multi-image multi-round conversation, combined images</b>
  </summary>

```python
from lmdeploy import pipeline, GenerationConfig

pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO')
messages = [
    dict(role='user', content=[
        dict(type='text', text='Describe the two images in detail.'),
        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
    ])
]
out = pipe(messages, gen_config=GenerationConfig(top_k=1))

messages.append(dict(role='assistant', content=out.text))
messages.append(dict(role='user', content='What are the similarities and differences between these two images?'))
out = pipe(messages, gen_config=GenerationConfig(top_k=1))
```

</details>
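
<details>
  <summary>
    <b>batched inference with multiple image-prompt pairs (illustrative sketch)</b>
  </summary>

This extra example is not from the upstream page; it is a minimal sketch of batched offline inference, assuming the list-of-`(prompt, image)` input form documented in [VLM Offline Inference Pipeline](./vl_pipeline.md), and it reuses the image URLs from the example above.

```python
from lmdeploy import GenerationConfig, pipeline
from lmdeploy.vl import load_image

pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf')

# Reuse the tutorial images referenced earlier on this page.
urls = [
    'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg',
    'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'
]
# One (prompt, image) pair per request; the pipeline runs them as a batch.
requests = [('Describe this image in one sentence.', load_image(u)) for u in urls]
responses = pipe(requests, gen_config=GenerationConfig(max_new_tokens=128))
for r in responses:
    print(r.text)
```

</details>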

## Online serving

You can launch the server with the `lmdeploy serve api_server` CLI:

```shell
lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf
```

You can also start the service with the official docker image mentioned above:

```shell
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
    -p 23333:23333 \
    --ipc=host \
    openmmlab/lmdeploy:latest \
    lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf
```

Docker Compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows:

```yaml
version: '3.5'

services:
  lmdeploy:
    container_name: lmdeploy
    image: openmmlab/lmdeploy:latest
    ports:
      - "23333:23333"
    environment:
      HUGGING_FACE_HUB_TOKEN: <secret>
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface
    stdin_open: true
    tty: true
    ipc: host
    command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: "all"
              capabilities: [gpu]
```

Then, you can execute the startup command as below:

```shell
docker-compose up -d
```

If you see the following logs after running `docker logs -f lmdeploy`, the service has launched successfully.

```text
HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!!
HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!!
HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!!
INFO: Started server process [2439]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit)
```

The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`.

More information about `api_server` and how to access the service can be found [here](api_server_vl.md).

diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md
index cf95e15d5c..c40f37308a 100644
--- a/docs/zh_cn/multi_modal/llava.md
+++ b/docs/zh_cn/multi_modal/llava.md
@@ -1,3 +1,135 @@
 # LLaVA

-TODO

LMDeploy 支持以下 LLaVA 系列模型，具体如下表所示：

| 模型                                 | 大小 | 支持的推理引擎     |
| :----------------------------------: | :--: | :----------------: |
| llava-hf/llava-interleave-qwen-7b-hf | 7B   | TurboMind, PyTorch |
| llava-hf/llava-1.5-7b-hf             | 7B   | TurboMind, PyTorch |
| liuhaotian/llava-v1.6-vicuna-7b      | 7B   | TurboMind, PyTorch |
| liuhaotian/llava-v1.6-mistral-7b     | 7B   | TurboMind, PyTorch |

接下来的章节将演示如何使用 LMDeploy 部署 LLaVA 模型，并以 [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) 为例。

## 安装

请按照[安装指南](../get_started/installation.md)安装 LMDeploy。

或者，您也可以使用官方的 Docker 镜像：

```shell
docker pull openmmlab/lmdeploy:latest
```

## 离线推理

以下示例代码展示了 VLM pipeline 的基本用法。有关详细信息，请参考 [VLM 离线推理流程](./vl_pipeline.md)。

```python
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image

pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf",
                backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5),
                gen_config=GenerationConfig(max_new_tokens=512))

image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
prompt = 'Describe the image.'
print(f'prompt:{prompt}')
response = pipe((prompt, image))
print(response)
```

更多示例：

<details>
  <summary>
    <b>多图片多轮对话，组合图片</b>
  </summary>

```python
from lmdeploy import pipeline, GenerationConfig

pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO')
messages = [
    dict(role='user', content=[
        dict(type='text', text='Describe the two images in detail.'),
        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
    ])
]
out = pipe(messages, gen_config=GenerationConfig(top_k=1))

messages.append(dict(role='assistant', content=out.text))
messages.append(dict(role='user', content='What are the similarities and differences between these two images?'))
out = pipe(messages, gen_config=GenerationConfig(top_k=1))
```

</details>
+ +## 在线服务 + +可以使用 `lmdeploy serve api_server` CLI 启动服务器: + +```shell +lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +或者,使用前面提到的 Docker 镜像启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +采用 Docker Compose 部署也是一种常见选择。在 lmdeploy 项目的根目录创建 `docker-compose.yml` 文件,如下: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:latest + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +然后,可以执行以下命令启动服务: + +```shell +docker-compose up -d +``` + +当运行 `docker logs -f lmdeploy` 后看到如下日志,说明服务启动成功: + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +可以通过 `lmdeploy serve api_server -h` 查看 `lmdeploy serve api_server` 的参数详情。 + +关于 `api_server` 以及如何访问服务的更多信息可以在[这里](api_server_vl.md)找到。 diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index a36102e1c6..b1da698e2e 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -5,6 +5,7 @@ from .internlm2 import InternLM2Model # noqa: F401 from .internvl import InternVLModel # noqa: F401 from .llama import LlamaModel # noqa: F401 +from .llava import LlavaModel # noqa: F401 from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/llava.py b/lmdeploy/turbomind/deploy/source_model/llava.py new file mode 100644 index 0000000000..3b4d82c37b --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/llava.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class LlavaReader(LlamaReader): + """LlavaReader for llama model.""" + + attn_layer_prefix = 'language_model.model.layers' + attn_layer_patten = r'language_model.model.layers.([0-9]+).' 
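+    # In LlavaForConditionalGeneration checkpoints the language model weights
+    # are nested under the `language_model.` prefix, hence the overridden
+    # LlamaReader keys above and below.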
+ tok_embeddings_key = 'language_model.model.embed_tokens.weight' + norm_weight_key = 'language_model.model.norm.weight' + output_weight_key = 'language_model.lm_head.weight' + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, + model_cfg: dict, policy): + model_cfg = model_cfg.get('text_config') + super().__init__(new_params, unused_params, last_bin, model_cfg, + policy) + + +@INPUT_MODELS.register_module(name='llava') +class LlavaModel(LlamaModel): + """LlavaModel model in hf format.""" + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + config = getattr(config, 'text_config', config) + arch = config.architectures[0] + _readers = dict(Qwen2ForCausalLM=LlavaReader, + LlamaForCausalLM=LlavaReader) + self.Reader = _readers[arch] + self.arch = arch + + def model_info(self): + """Read model info for LlavaForConditionalGeneration. + + https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf + """ + params_path = osp.join(self.model_path, 'config.json') + with open(params_path) as f: + model_arg = json.load(f)['text_config'] + num_layer = model_arg.get('num_hidden_layers', 32) + norm_eps = model_arg.get('rms_norm_eps', 1e-6) + attn_head_num = model_arg.get('num_attention_heads', 32) + if 'num_key_value_heads' in model_arg: + kv_head_num = model_arg.get('num_key_value_heads', 32) + else: + kv_head_num = model_arg.get('num_attention_heads', 32) + rope_theta = float(model_arg.get('rope_theta', 10000.0)) + max_position_embeddings = int( + model_arg.get('max_position_embeddings', 0)) + rope_scaling = model_arg.get('rope_scaling', None) + scaling_factor = 0.0 + use_dynamic_ntk = 0 + + # special for the model: llava-hf/llava-interleave-qwen-7b-hf + hidden_units = model_arg.get('hidden_size', 4096) + vocab_size = model_arg.get('vocab_size', 152000) + intermediate_size = model_arg.get('intermediate_size', 11008) + attn_bias = 1 if model_arg['architectures'][0] \ + == 'Qwen2ForCausalLM' else 0 + attn_bias = int(model_arg.get('attn_bias', attn_bias)) + use_logn_attn = int(model_arg.get('use_logn_attn', 0)) + + if isinstance(rope_scaling, dict): + scaling_type = model_arg['rope_scaling'].get('type', '') + scaling_factor = model_arg['rope_scaling'].get('factor', '') + if scaling_type == 'dynamic': + use_dynamic_ntk = 1 + + return dict(num_layer=num_layer, + norm_eps=norm_eps, + head_num=attn_head_num, + hidden_units=hidden_units, + kv_head_num=kv_head_num, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + use_dynamic_ntk=use_dynamic_ntk, + rope_scaling_factor=scaling_factor, + inter_size=intermediate_size, + use_logn_attn=use_logn_attn, + attn_bias=attn_bias, + vocab_size=vocab_size) diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 91b057d723..34e769776f 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -54,10 +54,14 @@ def main(head_num: int = 32, from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + for key in ['language_config', 'llm_config', 'text_config']: + config = getattr(config, key, config) head_num = config.num_attention_heads size_per_head = config.hidden_size // head_num inter_size = config.intermediate_size vocab_size = config.vocab_size + for bsz in range(1, max_batch_size + 1): 
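        # For each batch size up to max_batch_size, invoke the llama_gemm
        # tuning binary; it benchmarks GEMM algorithms for the model shapes
        # above and writes the selected configuration to gemm_config.ini.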
subprocess.call( f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}' diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 979ed0c547..fe0819d70f 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -25,6 +25,7 @@ # llava LlavaLlamaForCausalLM='llama', LlavaMistralForCausalLM='llama', + LlavaForConditionalGeneration='llava', # xcomposer2 InternLMXComposer2ForCausalLM='xcomposer2', # internvl @@ -95,5 +96,10 @@ def _is_head_dim_supported(cfg): support_by_turbomind = False elif arch == 'InternVLChatModel': support_by_turbomind = _is_head_dim_supported(cfg.llm_config) + elif arch == 'LlavaForConditionalGeneration': + sub_arch = cfg.text_config.architectures[0] + if sub_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: + support_by_turbomind = _is_head_dim_supported( + cfg.text_config) return support_by_turbomind
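
Note (reviewer aid, not part of the patch): a minimal way to sanity-check the new routing locally is the `is_supported` helper that wraps the logic patched above. The snippet below is an illustrative sketch and assumes lmdeploy is installed with this patch applied; the model id is the one used throughout the docs.

```python
from lmdeploy.turbomind.supported_models import is_supported

# A LlavaForConditionalGeneration checkpoint with a Qwen2 or Llama text
# backbone should now be reported as supported by the TurboMind engine.
print(is_supported('llava-hf/llava-interleave-qwen-7b-hf'))
```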