From 4a8d745604246ae6fdd1978688c28cf7ffda2a03 Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 11 Nov 2024 21:11:30 +0800 Subject: [PATCH 01/40] Support ep, column major moe kernel. (#2690) * support EP, optimize moe kernel * support ep and col major moe kernel * remove create_weight_ep --- lmdeploy/pytorch/backends/cuda/moe.py | 43 ++++++++++-- lmdeploy/pytorch/backends/dlinfer/moe.py | 14 ++-- lmdeploy/pytorch/backends/moe.py | 21 ++++-- lmdeploy/pytorch/kernels/cuda/fused_moe.py | 79 ++++++++-------------- lmdeploy/pytorch/nn/moe.py | 79 +++++++++++++++++----- 5 files changed, 153 insertions(+), 83 deletions(-) diff --git a/lmdeploy/pytorch/backends/cuda/moe.py b/lmdeploy/pytorch/backends/cuda/moe.py index e5ae92d8bd..eb38401211 100644 --- a/lmdeploy/pytorch/backends/cuda/moe.py +++ b/lmdeploy/pytorch/backends/cuda/moe.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List + import torch from lmdeploy.pytorch.kernels.cuda import fused_moe @@ -10,7 +12,11 @@ class TritonFusedMoEImpl(FusedMoEImpl): """triton fused moe implementation.""" - def __init__(self, top_k: int, renormalize: bool = False): + def __init__(self, + top_k: int, + num_experts: int, + renormalize: bool = False): + self.num_experts = num_experts self.top_k = top_k self.renormalize = renormalize @@ -23,16 +29,39 @@ def update_weights(self, gate_up_weights: torch.Tensor, 2).contiguous().transpose(1, 2) return gate_up_weights, down_weights - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, - down_weights: torch.Tensor): + def support_ep(self): + """support expert parallelism.""" + return True + + def ep_expert_list(self, world_size: int, rank: int): + """experts list of current rank.""" + num_experts = self.num_experts + expert_per_rank = (num_experts + world_size - 1) // world_size + first_expert = rank * expert_per_rank + last_expert = min(first_expert + expert_per_rank, num_experts) + return list(range(first_expert, last_expert)) + + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.LongTensor, + gate_up_weights: torch.Tensor, + down_weights: torch.Tensor, + expert_list: List[int] = None): """forward.""" + expert_offset = 0 + num_experts = None + if expert_list is not None and len(expert_list) != self.num_experts: + expert_offset = expert_list[0] + num_experts = self.num_experts return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights=topk_weights, topk_ids=topk_ids, topk=self.top_k, + expert_offset=expert_offset, + num_experts=num_experts, renormalize=self.renormalize) @@ -40,6 +69,8 @@ class TritonFusedMoEBuilder(FusedMoEBuilder): """triton fused moe builder.""" @staticmethod - def build(top_k: int, renormalize: bool = False): + def build(top_k: int, num_experts: int, renormalize: bool = False): """build from mlp.""" - return TritonFusedMoEImpl(top_k=top_k, renormalize=renormalize) + return TritonFusedMoEImpl(top_k=top_k, + num_experts=num_experts, + renormalize=renormalize) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index eb8b1e591e..90f6335ecb 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
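For reference, a minimal standalone Python sketch of the expert-parallel partitioning that `ep_expert_list` above implements: experts are split into contiguous, near-equal ranges per rank, and the first id of the local range is what `forward` hands to `fused_moe` as `expert_offset`. Only the arithmetic is taken from the diff; the driver loop and names outside it are illustrative.

```python
def ep_expert_list(num_experts: int, world_size: int, rank: int) -> list:
    """Contiguous expert range owned by `rank` (mirrors TritonFusedMoEImpl.ep_expert_list)."""
    expert_per_rank = (num_experts + world_size - 1) // world_size  # ceil division
    first_expert = rank * expert_per_rank
    last_expert = min(first_expert + expert_per_rank, num_experts)
    return list(range(first_expert, last_expert))


# Example: 8 experts over 3 ranks -> [0, 1, 2], [3, 4, 5], [6, 7]
for rank in range(3):
    local = ep_expert_list(8, 3, rank)
    expert_offset = local[0] if local else 0  # value passed to fused_moe as expert_offset
    print(rank, local, expert_offset)
```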
+from typing import List + import torch from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax @@ -38,9 +40,13 @@ def __init__(self, top_k: int, renormalize: bool = False): self.top_k = top_k self.renormalize = renormalize - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, - down_weights: torch.Tensor): + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.LongTensor, + gate_up_weights: torch.Tensor, + down_weights: torch.Tensor, + expert_list: List[int] = None): """forward.""" return fused_moe(hidden_states, self.top_k, topk_ids, topk_weights, gate_up_weights, down_weights) @@ -50,6 +56,6 @@ class DlinferFusedMoEBuilder(FusedMoEBuilder): """dlinfer fused moe builder.""" @staticmethod - def build(top_k: int, renormalize: bool = False): + def build(top_k: int, num_experts: int, renormalize: bool = False): """build from mlp.""" return DlinferFusedMoEImpl(top_k=top_k, renormalize=renormalize) diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index 4a1d5b73da..8e7977625e 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod +from typing import List import torch @@ -31,10 +32,22 @@ def update_weights(self, gate_up_weights: torch.Tensor, """update weights.""" return gate_up_weights, down_weights + def support_ep(self): + """support expert parallelism.""" + return False + + def ep_expert_list(self, world_size: int, rank: int): + """experts list of current rank.""" + raise NotImplementedError('Not Implemented.') + @abstractmethod - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, - down_weights: torch.Tensor): + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.LongTensor, + gate_up_weights: torch.Tensor, + down_weights: torch.Tensor, + expert_list: List[int] = None): """forward.""" raise NotImplementedError @@ -44,6 +57,6 @@ class FusedMoEBuilder(ABC): @staticmethod @abstractmethod - def build(top_k: int, renormalize: bool = False): + def build(top_k: int, num_experts: int, renormalize: bool = False): """build from mlp.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py index e9ac7087cd..9f9771368e 100644 --- a/lmdeploy/pytorch/kernels/cuda/fused_moe.py +++ b/lmdeploy/pytorch/kernels/cuda/fused_moe.py @@ -5,7 +5,7 @@ import triton.language as tl from .activation import silu_and_mul -from .triton_utils import get_kernel_meta, wrap_jit_func +from .triton_utils import get_kernel_meta def get_cuda_autotune_config(): @@ -13,16 +13,16 @@ def get_cuda_autotune_config(): triton.Config( { 'BLOCK_SIZE_M': 128, - 'BLOCK_SIZE_N': 256, - 'BLOCK_SIZE_K': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, }, - num_stages=3, - num_warps=8), + num_stages=4, + num_warps=4), triton.Config( { - 'BLOCK_SIZE_M': 128, - 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, }, @@ -43,34 +43,9 @@ def get_cuda_autotune_config(): @triton.autotune( configs=get_cuda_autotune_config(), key=['N', 'K', 'M_NP2'], + warmup=10, + rep=25, ) -@wrap_jit_func(type_hint=dict( - A=torch.Tensor, - B=torch.Tensor, - C=torch.Tensor, - 
SortedIdx=torch.Tensor, - ExpStart=torch.Tensor, - ExpEnd=torch.Tensor, - Weights=torch.Tensor, - N=int, - K=int, - stride_am=int, - stride_ak=int, - stride_be=int, - stride_bn=int, - stride_bk=int, - stride_cm=int, - stride_cn=int, - BLOCK_SIZE_M=torch.int32, - BLOCK_SIZE_N=torch.int32, - BLOCK_SIZE_K=torch.int32, - GROUP_SIZE_M=torch.int32, - ENABLE_WEIGHTS=bool, - top_k=torch.int32, - expert_offset=torch.int32, - reindex_a=bool, - reindex_c=bool, -)) @triton.jit def fused_moe_kernel( A, @@ -110,16 +85,23 @@ def fused_moe_kernel( if M <= 0: return - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_m = tl.cdiv(M_NP2, BLOCK_SIZE_M) num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - if pid_m * BLOCK_SIZE_M >= M: + + if GROUP_SIZE_M == 1: + pid_m = pid % num_pid_m + pid_n = pid // num_pid_m + # pid_m = pid // num_pid_n + # pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + if pid_m * BLOCK_SIZE_M >= M or pid_n * BLOCK_SIZE_N >= N: return offs_sid = exp_start + pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) @@ -189,11 +171,11 @@ def fused_moe_kernel_launcher( if num_tokens is None: num_tokens = A.size(0) M_NP2 = triton.next_power_of_2(num_tokens) - M_NP2 = max(32, M_NP2) + M_NP2 = max(64, M_NP2) E, N, K = B.shape def _grid_fn(META): - grid = (triton.cdiv(num_tokens, META['BLOCK_SIZE_M']) * + grid = (triton.cdiv(M_NP2, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), E) return grid @@ -229,13 +211,6 @@ def _grid_fn(META): ) -@wrap_jit_func(type_hint=dict(TopkIdx=torch.Tensor, - SortedIdx=torch.Tensor, - ExpStart=torch.Tensor, - ExpEnd=torch.Tensor, - len_sorted_idx=int, - num_experts=torch.int32, - BLOCK=torch.int32)) @triton.jit def _start_end_kernel(TopkIdx, SortedIdx, ExpStart, ExpEnd, len_sorted_idx: int, num_experts: tl.constexpr, diff --git a/lmdeploy/pytorch/nn/moe.py b/lmdeploy/pytorch/nn/moe.py index 6467a6de08..47176335c4 100644 --- a/lmdeploy/pytorch/nn/moe.py +++ b/lmdeploy/pytorch/nn/moe.py @@ -35,32 +35,54 @@ def __init__(self, renormalize: bool = False, dtype: Optional[torch.dtype] = None, device: Optional[torch.device] = None, - all_reduce: bool = True): + all_reduce: bool = True, + enable_ep: bool = False): super().__init__() if device is None: device = torch.device('cpu') if dtype is None: dtype = torch.float16 - hidden_dim, ffn_dim = self._update_args(hidden_dim, ffn_dim) impl_builder = get_backend().get_layer_impl_builder(OpType.FusedMoE) - self.impl = impl_builder.build(top_k, renormalize) - - gate_up_weights, down_weights = self.create_weights(hidden_dim, - ffn_dim, - num_experts, - dtype=dtype, - device=device) + self.impl = impl_builder.build(top_k, num_experts, renormalize) + + self.expert_list = None + self.expert_map = None + enable_ep = enable_ep and self.impl.support_ep() + if enable_ep: + world_size, rank = get_world_rank() + expert_list = self.impl.ep_expert_list(world_size, rank) + self.expert_list = expert_list + self.expert_map = dict( + (eid, idx) for idx, eid in enumerate(expert_list)) + num_experts = len(expert_list) + 
gate_up_weights, down_weights = self.create_weights(hidden_dim, + ffn_dim, + num_experts, + dtype=dtype, + device=device) + else: + hidden_dim, ffn_dim = self._update_args(hidden_dim, ffn_dim) + gate_up_weights, down_weights = self.create_weights(hidden_dim, + ffn_dim, + num_experts, + dtype=dtype, + device=device) gate_up_weights = torch.nn.Parameter(gate_up_weights, requires_grad=False) down_weights = torch.nn.Parameter(down_weights, requires_grad=False) - gate_up_weights.weight_loader = self.weight_loader - down_weights.weight_loader = self.weight_loader gate_up_weights._weight_type = 'gate_up_weights' down_weights._weight_type = 'down_weights' self.register_parameter('gate_up_weights', gate_up_weights) self.register_parameter('down_weights', down_weights) + if enable_ep: + gate_up_weights.weight_loader = self.weight_loader_ep + down_weights.weight_loader = self.weight_loader_ep + else: + gate_up_weights.weight_loader = self.weight_loader_tp + down_weights.weight_loader = self.weight_loader_tp + self.hidden_dim = hidden_dim self.ffn_dim = ffn_dim self.num_experts = num_experts @@ -91,21 +113,23 @@ def create_weights(self, hidden_dim: int, ffn_dim: int, num_experts: int, def update_weights(self): """update weights.""" + gateup_loader = self.gate_up_weights.weight_loader + down_loader = self.down_weights.weight_loader gate_up_weights, down_weights = self.impl.update_weights( self.gate_up_weights, self.down_weights) gate_up_weights = torch.nn.Parameter(gate_up_weights, requires_grad=False) down_weights = torch.nn.Parameter(down_weights, requires_grad=False) - gate_up_weights.weight_loader = self.weight_loader - down_weights.weight_loader = self.weight_loader + gate_up_weights.weight_loader = gateup_loader + down_weights.weight_loader = down_loader gate_up_weights._weight_type = 'gate_up_weights' down_weights._weight_type = 'down_weights' self.register_parameter('gate_up_weights', gate_up_weights) self.register_parameter('down_weights', down_weights) - def weight_loader(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, expert_id: int, - shard_id: str): + def weight_loader_tp(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, expert_id: int, + shard_id: str): """weight loader.""" world_size, rank = get_world_rank() if shard_id == 'gate': @@ -121,10 +145,31 @@ def weight_loader(self, param: torch.nn.Parameter, raise RuntimeError(f'Unknown shard_id: {shard_id}') param_data.copy_(weight) + def weight_loader_ep(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, expert_id: int, + shard_id: str): + """weight loader.""" + expert_list = self.expert_list + if expert_id not in expert_list: + return + + expert_map = self.expert_map + param_id = expert_map[expert_id] + if shard_id == 'gate': + param_data = param.data[param_id, :self.ffn_dim] + elif shard_id == 'up': + param_data = param.data[param_id, self.ffn_dim:] + elif shard_id == 'down': + param_data = param.data[param_id] + else: + raise RuntimeError(f'Unknown shard_id: {shard_id}') + param_data.copy_(loaded_weight) + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.LongTensor): ret = self.impl.forward(hidden_states, topk_weights, topk_ids, - self.gate_up_weights, self.down_weights) + self.gate_up_weights, self.down_weights, + self.expert_list) if self.all_reduce: dist.all_reduce(ret) return ret From 67a85384de625618ba93d92c04d5b2e3d10d6f8f Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:40:17 +0800 
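Before moving on to the next patch, a minimal sketch (not the lmdeploy implementation) of the expert-parallel loading path added above in `weight_loader_ep`: a rank silently skips experts it does not own and remaps a global expert id to a local parameter slot through `expert_map`. Tensor shapes are simplified and the helper name is illustrative.

```python
import torch


def load_expert_ep(gate_up: torch.Tensor,      # [num_local_experts, 2 * ffn_dim, ...]
                   down: torch.Tensor,         # [num_local_experts, ...]
                   loaded_weight: torch.Tensor,
                   expert_id: int,
                   shard_id: str,              # 'gate' | 'up' | 'down'
                   expert_list: list,
                   ffn_dim: int):
    """Copy one checkpoint shard into the local EP parameters (shapes simplified)."""
    if expert_id not in expert_list:
        return  # this expert lives on another rank
    local_id = {eid: idx for idx, eid in enumerate(expert_list)}[expert_id]
    if shard_id == 'gate':
        gate_up[local_id, :ffn_dim].copy_(loaded_weight)
    elif shard_id == 'up':
        gate_up[local_id, ffn_dim:].copy_(loaded_weight)
    elif shard_id == 'down':
        down[local_id].copy_(loaded_weight)
    else:
        raise RuntimeError(f'Unknown shard_id: {shard_id}')
```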
Subject: [PATCH 02/40] Remove one of the duplicate bos tokens (#2708) * Remove one of the duplicate bos tokens * Update tokenizer.py --- lmdeploy/tokenizer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py index e977005588..fb4364602a 100644 --- a/lmdeploy/tokenizer.py +++ b/lmdeploy/tokenizer.py @@ -624,7 +624,14 @@ def encode(self, Returns: list[int]: token ids """ - return self.model.encode(s, add_bos, add_special_tokens, **kwargs) + encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs) + if encoded[:2] == [self.bos_token_id] * 2: + get_logger('lmdeploy').warn( + f'Detected duplicate bos token {self.bos_token_id} in prompt, ' + 'this will likely reduce response quality, one of them will be' + 'removed') + encoded = encoded[1:] + return encoded def decode( self, From e7517080903b9a5b2086818f800a1b1b212b4e3a Mon Sep 17 00:00:00 2001 From: vinkle Date: Tue, 12 Nov 2024 21:15:09 +0800 Subject: [PATCH 03/40] fix assert pad >= 0 failed when inter_size is not a multiple of group_size (#2740) --- lmdeploy/turbomind/deploy/target_model/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 4750cde850..abd570cd00 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -38,7 +38,8 @@ def _weight_dtype_map(weight_type: str, default=None): def _pad_inter_size(inter_size: int, group_size: int, tp: int): group_size = max(1, group_size) - groups_per_rank = (inter_size // group_size + tp - 1) // tp + group_num = (inter_size + group_size - 1) // group_size + groups_per_rank = (group_num + tp - 1) // tp inter_size_padded = groups_per_rank * group_size * tp return inter_size_padded From d2d4209d148c09356492a04000a878270896178c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 13 Nov 2024 11:27:26 +0800 Subject: [PATCH 04/40] Support Qwen2-MoE models (#2723) * add qwen2-moe * eliminate `inter_size_` from ffn layer * clean up * fix lint * clean up * Update config.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 2 + lmdeploy/turbomind/deploy/config.py | 2 + lmdeploy/turbomind/deploy/module.py | 21 +- .../turbomind/deploy/source_model/mixtral.py | 2 + .../turbomind/deploy/source_model/qwen.py | 61 +++ lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/gemm/moe_utils_v2.cu | 449 ++++++++++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 4 +- .../kernels/gemm/test/test_moe_utils.cu | 103 ++-- src/turbomind/kernels/gemm/test/testbed.h | 2 + .../models/llama/LlamaDecoderLayerWeight.cc | 52 +- src/turbomind/models/llama/LlamaDenseWeight.h | 29 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 47 +- src/turbomind/models/llama/LlamaFfnLayer.h | 11 +- src/turbomind/models/llama/llama_params.h | 8 +- src/turbomind/models/llama/moe_ffn_layer.cc | 78 +-- src/turbomind/models/llama/moe_ffn_layer.h | 6 +- src/turbomind/models/llama/unified_decoder.cc | 21 +- .../triton_backend/llama/LlamaTritonModel.cc | 2 + 19 files changed, 711 insertions(+), 190 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 9357e473bb..587ee6331b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -44,10 +44,12 @@ turbomind_chat_model: - Qwen/Qwen2-1.5B-Instruct - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ + - Qwen/Qwen1.5-MoE-A2.7B-Chat - 
Qwen/Qwen-VL-Chat - Qwen/Qwen2.5-0.5B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 7e8ebf7b47..a535b0d4c1 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -50,6 +50,8 @@ class ModelConfig: expert_num: int = 0 expert_inter_size: int = 0 experts_per_token: int = 0 + moe_shared_gate: int = False + moe_norm_topk: int = False def verify(self): invalid = {} diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index a9f7385376..8d998abe2b 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -140,14 +140,18 @@ class MoeFfn(Ffn): requires: r.moe_ffn_expert(e, i, kind) r.moe_ffn_gate(i) + r.moe_ffn_shared_gate(i) """ _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}' - _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}' + _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight' + _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight' def __init__(self, model: BaseOutputModel): super().__init__(model) self.expert_num = model.model_config.expert_num + self.inter_size = model.model_config.expert_inter_size + self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): for p in get_params(r.moe_ffn_expert()): @@ -157,7 +161,13 @@ def apply(self, i: int, r: BaseReader): i) gate = transpose(r.moe_ffn_gate(i)) - self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight')) + self.model.save_split(gate, self._moe_ffn_gate.format(i)) + + if self.shared_gate: + shared_gate = transpose(r.moe_ffn_shared_gate(i)) + # print(shared_gate) + self.model.save_split(shared_gate, + self._moe_ffn_shared_gate.format(i)) class Attn(Module): @@ -248,8 +258,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - ffn = MoeFfn if model.model_config.expert_num else Ffn - modules = [Attn, LayerNorm, ffn] + modules = [Attn, LayerNorm] + if model.model_config.inter_size: + modules.append(Ffn) + if model.model_config.expert_num: + modules.append(MoeFfn) self.modules = [c(model) for c in modules] self.misc = Misc(model) diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index 102ede29f2..ff9df2d409 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,4 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] + info['moe_norm_topk'] = True + info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 0ec0586a37..772bd03037 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,3 +120,64 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg + + +class Qwen2MoeReader(LlamaReader): + + ffn_pattern = r'shared_expert\.' 
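As a quick reference for the `Qwen2MoeReader` introduced here (its methods continue below), this sketch enumerates the HuggingFace checkpoint keys it consumes per decoder layer: the router gate, the shared-expert gate, the shared expert's three projections, and the per-expert projections. The key patterns come from the diff; the loop bounds and expert count are illustrative.

```python
def qwen2_moe_layer_keys(layer: int, num_experts: int) -> list:
    """Checkpoint keys consumed per decoder layer (patterns from the reader below)."""
    keys = [
        f'model.layers.{layer}.mlp.gate.weight',                # router logits
        f'model.layers.{layer}.mlp.shared_expert_gate.weight',  # shared-expert gate
    ]
    for proj in ('gate', 'up', 'down'):
        keys.append(f'model.layers.{layer}.mlp.shared_expert.{proj}_proj.weight')
        keys += [
            f'model.layers.{layer}.mlp.experts.{e}.{proj}_proj.weight'
            for e in range(num_experts)
        ]
    return keys


# e.g. Qwen1.5-MoE-A2.7B has 60 routed experts: 2 + 3 + 3 * 60 = 185 keys per layer
print(len(qwen2_moe_layer_keys(0, num_experts=60)))
```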
+ + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[ + f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_shared_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.shared_expert_gate.weight') + + +@INPUT_MODELS.register_module(name='qwen2-moe') +class Qwen2MoeModel(LlamaModel): + + Reader = Qwen2MoeReader + + def tokenizer_info(self): + """https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con + fig.json.""" # noqa: E501 + n_words = 152064 + bos_id = 151643 + eos_id = 151645 + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_experts'] + info['expert_inter_size'] = cfg['moe_intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + info['inter_size'] = cfg['shared_expert_intermediate_size'] + info['moe_shared_gate'] = True + info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['attn_bias'] = 1 + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index fe0819d70f..f6772fddd5 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -20,6 +20,7 @@ QWenLMHeadModel='qwen', # Qwen2 Qwen2ForCausalLM='qwen2', + Qwen2MoeForCausalLM='qwen2-moe', # mistral MistralForCausalLM='llama', # llava diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index acf6355856..5912c60a8a 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" @@ -19,7 +19,7 @@ namespace turbomind { template __global__ void MoeGateKernel_V2(float* scales, // [e,n] - int* masks, // [E,n], padded + int8_t* masks, // [E,n], padded int* accum, // [E,tiles] const float* logits, // [E,n] int log_tile, @@ -88,6 +88,8 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] const int lowbit = (mask & -mask); const int e = 31 - __clz(lowbit); + // printf("e = %d, ti = %d, idx = %d\n", e, ti, i); + masks[e * tokens_padded + ti] = i; atomicAdd(&shared_accum[e][ti >> log_tile], 1); top_val[i] = logits[ti * experts + e]; @@ -120,11 +122,11 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] } } -template -__global__ void MoeScanKernel_V2(int* f2n, // [e*n] +template +__global__ void MoeScanKernel_v2(int* f2n, // [e*n] int* en2f, // [e,n] int* offsets, // [E+1] - int* masks, // [E,n], padded + Mask* masks, // [E,n], padded const int* accum, // [E,tiles] int log_tile, int tiles, @@ -142,13 +144,15 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] constexpr int vec_size = kMoeGateVecSize; - using Vec = Array; + using Vec = Array; const int tile_id = blockIdx.x; const int ei = 
blockIdx.y; - const int global_tile_id = ei * tiles + tile_id; + const int global_tile_id = ei * tiles + tile_id; + const bool is_valid = global_tile_id <= experts * tiles; +#if 0 int vacc[4]{}; { int idx = threadIdx.x; @@ -162,6 +166,18 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); +#else + + int vacc = 0; + for (int i = threadIdx.x; i < global_tile_id; i += block_dim) { + if (is_valid && i < global_tile_id) { + vacc += accum[i]; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + +#endif __shared__ int shared_offset; @@ -200,7 +216,7 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] const bool pred = vi < tile_vec_end; Vec data; - fill(data, -1); + fill(data, Mask{-1}); if (pred) { Ldg(data, mask_ptr[vi].data()); } @@ -231,17 +247,328 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } } +template +__global__ void MoeGateKernel_v8(float* scales, // [e,n] + Mask* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k, + bool norm_topk) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + constexpr int tokens_per_cta = block_dim / threads_per_token; + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int bti = threadIdx.x / threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + + const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + + float data[items_per_thread]; + int idxs[items_per_thread]; + +#if 0 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = threads_per_token * i + ei * access_size; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#elif 1 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + // idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + idxs[i] = ei * items_per_thread + i; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + // const int e = threads_per_token * i + ei * access_size; + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int 
shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#else + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + constexpr int vecs_per_thread = items_per_thread / access_size; + + using Vec = Array; + constexpr int banks = 128 / sizeof(Vec); + constexpr int chunks = 4; // block_dim / WARP_SIZE; + + __shared__ union { + Vec shared_data[chunks][vecs_per_thread * WARP_SIZE / banks][banks + 1]; + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; + + __align__(16) Vec vecs[vecs_per_thread]; + + { + const int warp_end = min(warp_offset + WARP_SIZE / threads_per_token, token_num) * expert_num; + int p = warp_offset * expert_num + access_size * lane_id; + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + fill(vecs[i], -std::numeric_limits::infinity()); + // const int p = warp_offset * expert_num + access_size * (lane_id + i * WARP_SIZE); + if (p < warp_end) { + Ldg(vecs[i], &logits[p]); + } + p += access_size * WARP_SIZE; + } + } + + PRAGMA_UNROLL + for (int c = 0; c < block_dim / WARP_SIZE; c += chunks) { + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = i * WARP_SIZE + lane_id; + if (c <= warp_id && warp_id < c + chunks) { + Store(smem.shared_data[warp_id - c][p / banks][p % banks].data(), vecs[i]); + } + } + + __syncwarp(); + + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = lane_id * vecs_per_thread + i; + if (c <= warp_id && warp_id < c + chunks) { + Load(vecs[i], smem.shared_data[warp_id - c][p / banks][p % banks].data()); + } + } + + __syncthreads(); + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + idxs[i] = ei * items_per_thread + i; + } + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + (Array&)data[i * access_size] = vecs[i]; + } + +#endif + + constexpr float kLog2e = 1.4426950408889634074; + + unsigned mask = (unsigned)-1; + float max_logit; + + int count{}; + float sum_prob{}; + + const int warp_ti_offset = warp_ti * threads_per_token; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + if (k == 0) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] *= kLog2e; + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + ++count; + } + }; + + run(0); + + for (int k = 1; k < top_k; ++k) { + run(k); + } + + mask = ~mask; + + int used[items_per_thread]; + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + used[i] = (mask & bit) > 0; + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : 
"r"(bit)); + } + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (!norm_topk || used[i]) { + data[i] = exp2f(data[i] - max_logit); + sum_prob += data[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; + + int idx{}; + WarpScan{temp_storage[bti]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + smem.shared_exp_id[idx][bti] = idxs[i]; + smem.shared_scales[idx][bti] = data[i] * sum_prob; + ++idx; + } + } + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + smem.shared_accum[t][e] = 0; + } + + __syncthreads(); + + constexpr int k_per_thread = cdiv(max_top_k, threads_per_token); + + const int bti2 = threadIdx.x % tokens_per_cta; + const int ei2 = threadIdx.x / tokens_per_cta; + const int ti2 = blockIdx.x * tokens_per_cta + bti2; + + PRAGMA_UNROLL + for (int i = 0; i < k_per_thread; ++i) { + const int idx = ei2 * k_per_thread + i; + const int expert_id = smem.shared_exp_id[idx][bti2]; + const float scale = smem.shared_scales[idx][bti2]; + + if (ti2 < token_num && idx < top_k) { + masks[expert_id * token_num_padded + ti2] = idx; + scales[idx * token_num + ti2] = scale; + atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); + + // printf("%d %d %f\n", idx, expert_id, scale); + } + } + + __syncthreads(); + + for (int i = 0; i < max_expert_num * max_tiles; i += block_dim) { + int t = (threadIdx.x + i) % max_tiles; + int e = (threadIdx.x + i) / max_tiles; + if (e < expert_num && t < tiles) { + atomicAdd(accum + e * tiles + t, smem.shared_accum[t][e]); + } + } +} + +template +inline constexpr std::integral_constant _Int{}; + void invokeMoeGate_V2(int* f2n, // [e*n] -> n int* en2f, // [e,n] -> n*e int* offsets, // [E+1] float* scales, // [e,n] - int* masks, // [E,n] + void* masks, // [E,n] int* accum, // [E] const float* logits, // [e,n] int tokens, // n int tokens_padded, // round_up(n, 4) int experts, // E int experts_per_token, + bool norm_topk, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -254,48 +581,64 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - { - constexpr int threads = 128; - const int blocks = ceil_div(tokens, threads); + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - auto invoke = [&](auto e) { - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // + MoeGateKernel_v8 + <<>>( // scales, - masks, + (int8_t*)masks, accum, logits, log_tile, tiles, tokens, tokens_padded, - experts); - }; + experts, + experts_per_token, + norm_topk); + }; - switch (experts_per_token) { - case 2: - invoke(std::integral_constant{}); - break; - // case 4: - // invoke(std::integral_constant{}); - // break; - default: - std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. 
" << std::endl; - std::abort(); + auto fail = [&] { + std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::abort(); + }; + + if (experts <= 8) { + if (experts_per_token <= 2) { + invoke(_Int<8>, _Int<2>, _Int<8>); + } + else { + invoke(_Int<8>, _Int<8>, _Int<8>); } } - - // return; + else if (experts <= 64) { + if (experts_per_token <= 4) { + invoke(_Int<64>, _Int<4>, _Int<16>); + } + else if (experts_per_token <= 8) { + invoke(_Int<64>, _Int<8>, _Int<16>); + } + else { + fail(); + } + } + else { + fail(); + } { - // Check: tiles * experts <= threads - constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; const dim3 blocks(tiles, experts + 1); - MoeScanKernel_V2<<>>(f2n, // + + MoeScanKernel_v2<<>>(f2n, // en2f, offsets, - masks, + (int8_t*)masks, accum, log_tile, tiles, @@ -338,10 +681,11 @@ void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int exper template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); template -__global__ void MoeReduceKernel(T* dst, // [ n, d] - const T* src, // [e*n, d] - const float* scales, // [ e, n] - const int* en2f, // [ e, n] :: (e,n) -> e*n +__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + const float* dst_scales, // [n] int dims, int tokens) { @@ -351,6 +695,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; + float dst_scale = 0; + if (dst_scales) { + dst_scale = dst_scales[ti]; + dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); + } + // Should be warp uniforms const Vec* src_ptr[exp_k]; float scale[exp_k]; @@ -362,6 +712,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] for (int i = threadIdx.x; i < dims; i += block_dim) { Array accum{}; + if (dst_scales) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = cast(v) * dst_scale; + } PRAGMA_UNROLL for (int e = 0; e < exp_k; ++e) { Vec v; @@ -379,6 +735,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, @@ -395,6 +752,7 @@ void invokeMoeReduce(T* dst, src, scales, en2f, + dst_scales, dims / vec_size, tokens); }; @@ -404,19 +762,22 @@ void invokeMoeReduce(T* dst, return invoke(std::integral_constant{}); case 2: return invoke(std::integral_constant{}); - // case 4: - // return invoke(std::integral_constant{}); - // case 6: - // return invoke(std::integral_constant{}); + case 4: + return invoke(std::integral_constant{}); + case 6: + return invoke(std::integral_constant{}); + case 8: + return invoke(std::integral_constant{}); default: fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); std::abort(); } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +template void +invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h 
b/src/turbomind/kernels/gemm/moe_utils_v2.h index 334e2de272..0e4c36af09 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -14,13 +14,14 @@ void invokeMoeGate_V2(int* f2n, int* en2f, int* offsets, float* scales, - int* masks, + void* masks, int* accum, const float* logits, int tokens, int tokens_padded, int experts, int exp_per_tok, + bool norm_topk, cudaStream_t st); template @@ -49,6 +50,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index a311162193..47e3bfdb16 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -26,6 +26,25 @@ void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) } } +template +void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) +{ + if (!msg.empty()) { + std::cout << msg << ": [" << m << ", " << k << "]\n"; + } + for (int mm = 0; mm < m; ++mm) { + std::cout << "m=" << mm << ": "; + for (int kk = 0; kk < k; ++kk) { + const auto& x = data[mm * k + kk]; + const auto& y = refs[mm * k + kk]; + if (x != y) { + std::cout << kk << "(" << x << ", " << y << ") "; + } + } + std::cout << "\n"; + } +} + #if 0 void func() { @@ -190,7 +209,7 @@ void moe_gate_ref(int tokens, } } -void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +void mask2eids(universal_vector& masks, universal_vector& eids, int tokens, int expert_num) { const int tokens_padded = masks.size() / expert_num; // std::cout << eids.size() << std::endl; @@ -228,13 +247,13 @@ bool test_moe_gate(int tokens, // const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); - universal_vector offsets(expert_num + 1); - universal_vector accum(expert_num * kMoeGateMaxTiles); - universal_vector masks(expert_num * tokens_padded); - universal_vector eids(experts_per_token * tokens); - universal_vector f2n(experts_per_token * tokens); - universal_vector en2f(experts_per_token * tokens); - universal_vector scales(experts_per_token * tokens); + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); // universal_vector coords(max_coords); // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); @@ -246,8 +265,16 @@ bool test_moe_gate(int tokens, // moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); - for (int i = 0; i < 10; ++i) { + cudaMemPrefetchAsync(f2n.data().get(), sizeof(int) * f2n.size(), 0); + cudaMemPrefetchAsync(en2f.data().get(), sizeof(int) * en2f.size(), 0); + cudaMemPrefetchAsync(offsets.data().get(), sizeof(int) * offsets.size(), 0); + cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); + cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + + for (int i = 0; i < 1; ++i) { + gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, 
sizeof(int) * accum.size()); + cudaMemset(masks.data().get(), -1, sizeof(int8_t) * masks.size()); invokeMoeGate_V2(f2n.data().get(), en2f.data().get(), offsets.data().get(), @@ -259,6 +286,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, + true, 0); } @@ -306,7 +334,10 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || false) { + if (!success && 1) { + + diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); @@ -322,32 +353,32 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); - print_vecs(accum.data().get(), expert_num, 1, "accum"); + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); - thrust::host_vector tile_offsets(tape.max_ctas); - std::cout << tape.max_ctas << std::endl; - cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - - std::cout << "coords:\n"; - int last = -1; - for (int i = 0; i < tape.max_ctas; ++i) { - auto& c = tile_offsets[i]; - if (last >= 0 && c.w != last) { - std::cout << "\n"; - } - if (c.w == -1) { - std::cout << i << "\n"; - break; - } - last = c.w; - std::stringstream ss; - ss << c.x << "," << c.y; - std::cout << std::setw(6) << ss.str(); - } - std::cout << "\n"; + // thrust::host_vector tile_offsets(tape.max_ctas); + // std::cout << tape.max_ctas << std::endl; + // cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + + // std::cout << "coords:\n"; + // int last = -1; + // for (int i = 0; i < tape.max_ctas; ++i) { + // auto& c = tile_offsets[i]; + // if (last >= 0 && c.w != last) { + // std::cout << "\n"; + // } + // if (c.w == -1) { + // std::cout << i << "\n"; + // break; + // } + // last = c.w; + // std::stringstream ss; + // ss << c.x << "," << c.y; + // std::cout << std::setw(6) << ss.str(); + // } + // std::cout << "\n"; } return success; @@ -358,7 +389,11 @@ int main() gemm::Tape tape{}; constexpr Tiling tiling{14336, 128, {128, 128, 32}}; - test_moe_gate(8192, 8, 2, tape, tiling); + // test_moe_gate(32768 * 4, 60, 4, tape, tiling); + // test_moe_gate(32768, 64, 8, tape, tiling); + // test_moe_gate(8, 60, 4, tape, tiling); + + test_moe_gate(65536, 8, 2, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6b1ec88f58..7a089fbdf2 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -514,6 +514,7 @@ class Testbed { c_e_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, @@ -523,6 +524,7 @@ class Testbed { c_e_ref_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 2d68ef3535..f6f9ab0efa 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ 
b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -137,6 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, moe_param.inter_size, moe_param.expert_num, moe_param.method, + moe_param.shared_gate, tensor_para_size_, weight_type, group_size, @@ -349,18 +350,22 @@ void LlamaDecoderLayerWeight::mallocWeights() mallocWeights(self_attn_weights.qkv, attn_bias_); mallocWeights(self_attn_weights.output, attn_bias_); - if (moe_weights.experts.empty()) { + if (inter_size_) { mallocWeights(ffn_weights.gating, false); mallocWeights(ffn_weights.intermediate, false); mallocWeights(ffn_weights.output, false); } - else { + + if (!moe_weights.experts.empty()) { mallocWeights(moe_weights.gate, false); for (auto& e : moe_weights.experts) { mallocWeights(e.gating, false); mallocWeights(e.intermediate, false); mallocWeights(e.output, false); } + if (moe_weights.shared_gate.output_dims) { + mallocWeights(moe_weights.shared_gate, false); + } } } @@ -375,10 +380,25 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); + if (inter_size_) { + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); + freeWeights(ffn_weights.output); + } + + if (!moe_weights.experts.empty()) { + freeWeights(moe_weights.gate); + for (auto& e : moe_weights.experts) { + freeWeights(e.fused_gating_intermediate); + freeWeights(e.gating); + freeWeights(e.intermediate); + freeWeights(e.output); + } + if (moe_weights.shared_gate.kernel) { + freeWeights(moe_weights.shared_gate); + } + } } template @@ -428,23 +448,30 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - if (moe_weights.experts.empty()) { + if (inter_size_) { getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); } - else { + + if (!moe_weights.experts.empty()) { output.insert( concat(prefix, "moe_ffn.gate.weight"), Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); auto& experts = moe_weights.experts; for (size_t i = 0; i < experts.size(); ++i) { const std::string name = "moe_ffn.experts." 
+ std::to_string(i); - // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); } + if (moe_weights.shared_gate.kernel) { + output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), + Tensor{MEMORY_GPU, + getTensorType(), + {moe_weights.shared_gate.kernel_size()}, + moe_weights.shared_gate.kernel}); + } } return output; @@ -681,10 +708,13 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud convert(ffn.output, is_fused_moe, workspace, size, is_16xx); }; - if (moe_weights.experts.empty()) { + if (inter_size_) { + // std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } - else { + + if (!moe_weights.experts.empty()) { + // std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 9a895243bc..169fb53bcf 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -145,24 +145,28 @@ struct LlamaFfnWeight { LlamaFfnWeight( size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + inter_size /= tp; + + this->inter_size = inter_size; + gating.input_dims = hidden_dim; - gating.output_dims = inter_size / tp; + gating.output_dims = inter_size; gating.type = weight_type; gating.group_size = group_size; intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size / tp; + intermediate.output_dims = inter_size; intermediate.type = weight_type; intermediate.group_size = group_size; fused_gating_intermediate.input_dims = hidden_dim; - fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.output_dims = inter_size * 2; fused_gating_intermediate.type = weight_type; fused_gating_intermediate.group_size = group_size; is_fused_silu = fuse_silu_act; - output.input_dims = inter_size / tp; + output.input_dims = inter_size; output.output_dims = hidden_dim; output.type = weight_type; output.group_size = group_size; @@ -173,6 +177,7 @@ struct LlamaFfnWeight { LlamaDenseWeight output; LlamaDenseWeight fused_gating_intermediate; + int inter_size{}; bool is_fused_silu{}; }; @@ -185,11 +190,15 @@ struct MoeFfnWeight { int inter_size, int expert_num, int method, + bool has_shared_gate, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + + // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (expert_num == 0) { return; } @@ -208,11 +217,23 @@ struct MoeFfnWeight { // inter size is divided by tp in `FfnWeight` e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; } + + if (has_shared_gate) { + shared_gate.input_dims = hidden_dim; + shared_gate.output_dims = 1; + shared_gate.type = get_default_weight_type(); + gate.group_size = group_size; + } + else { + shared_gate = {}; + } } LlamaDenseWeight gate; std::vector> experts; + LlamaDenseWeight shared_gate; + LlamaFfnWeight block; int method{}; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index f9ee0c4ad4..8cce207203 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ 
b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -28,10 +28,11 @@ namespace turbomind { template void LlamaFfnLayer::allocateBuffer(size_t token_num, + int inter_size, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - const size_t sz = token_num * inter_size_; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating->lora.r; const size_t sz_inter = token_num * inter->lora.r; @@ -51,24 +52,24 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int token_num, bool is_chunked) +void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) { NvtxScope scope("activation"); if (is_chunked) { + // gate & up are in the SAME buffer invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); sync_check_cuda_error(); } else { - invokeGenericActivation_v2( - gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + // gate & up are in separate buffers + invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); sync_check_cuda_error(); } } @@ -88,11 +89,11 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("ffn"); - const size_t num_token = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - // LOG(WARNING); + const size_t token_num = input_tensors->at("ffn_input").shape[0]; + const int layer_id = input_tensors->getVal("layer_id"); + const int inter_size = weights->inter_size; - allocateBuffer(num_token, &weights->gating, &weights->intermediate); + allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); @@ -103,50 +104,50 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type); sync_check_cuda_error(); if (!weights->is_fused_silu) { - activation(num_token, true); + activation(token_num, inter_size, true); } - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } else { { // w1(x) NvtxScope scope("w1"); - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->gating, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3); { // w3(x) NvtxScope scope("w3"); linear_->forward( - inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear::kGemm, lora_mask); + inter_buf_, ffn_input_data, token_num, weights->intermediate, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3); + count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3); // silu(w1(x)) * w3(x) - activation(num_token, false); + activation(token_num, inter_size, false); - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0; + const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; linear_->forward( - ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + ffn_output_data, {gating_buf_, pitch}, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); + count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); - ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); + ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 75ced5f9ac..2daca2cc95 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,12 +19,11 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/utils/custom_ar_comm.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nccl_utils.h" -#include namespace turbomind { @@ -32,7 +31,6 @@ template class LlamaFfnLayer { public: LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): - inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), @@ -50,13 +48,12 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); void freeBuffer(); - void activation(int token_num, bool is_chunked); + void activation(int token_num, int inter_size, bool is_chunked); - const size_t inter_size_; const size_t hidden_units_; const NcclParam tensor_para_; cudaStream_t const stream_; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 1c039ca66a..2ea63f0410 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -31,9 +31,11 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; + int expert_num; + int experts_per_token; + int inter_size; + bool norm_topk; + bool shared_gate; }; struct AttentionParam { diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index def6b04abb..1ad76839d1 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -30,6 +30,7 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); + alloc(&shared_scales_, tokens); return (char*)alloc.ptr() - (char*)base; }; @@ -69,7 +70,7 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama getCudaDataType(), hidden_dim_, &beta, - logits_, + logits, CUDA_R_32F, 
weight.output_dims, CUDA_R_32F, @@ -77,13 +78,13 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama } template -void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; AllocateBuffer(tokens, padded); - gate(logits_, inout, tokens, moe.gate); + gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); @@ -103,6 +104,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei padded, param_.expert_num, param_.experts_per_token, + param_.norm_topk, stream_); sync_check_cuda_error(); @@ -123,7 +125,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); sync_check_cuda_error(); check_cuda_error( @@ -155,28 +157,8 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei auto& block = moe.block; -#if 0 - FT_CHECK(!block.is_fused_silu); - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - inter_size_ * 2, - count, - hidden_dim_, - moe.experts[i].fused_gating_intermediate.kernel, - hidden_dim_, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2); - sync_check_cuda_error(); - } - } - auto mode = kCmpWrite; -#else linear_->forward_moe(inter_buf_, - {inout, (int)hidden_dim_}, + {input, (int)hidden_dim_}, f2n_, offsets_, tokens * param_.experts_per_token, @@ -185,7 +167,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_, // @@ -205,25 +186,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei sync_check_cuda_error(); } -#if 0 - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - hidden_dim_, - count, - inter_size_, - moe.experts[i].output.kernel, - inter_size_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_); - sync_check_cuda_error(); - } - } - auto mode1 = kCmpWrite; -#else linear_->forward_moe(inout_buf_, {inter_buf_, block.is_fused_silu ? 
(int)inter_size_ : (int)inter_size_ * 2}, nullptr, @@ -234,7 +196,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode1 = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_2_, // @@ -250,18 +211,29 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei // } } - invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + if (moe.shared_gate.kernel) { + gate(shared_scales_, input, tokens, moe.shared_gate); + } +} + +template +void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +{ + invokeMoeReduce(output, + inout_buf_, + scales_, + en2f_, + moe.shared_gate.kernel ? shared_scales_ : nullptr, + tokens, + param_.experts_per_token, + hidden_dim_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } - - // if (tensor_para_.rank_ == 0) { - // check_cuda_error(cudaStreamSynchronize(stream_)); - // std::abort(); - // } } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index ef65aaa464..0f1713f7b5 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -51,7 +51,9 @@ class MoeFfnLayer { FreeBuffer(); } - void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); + + void reduce(T* output, int tokens, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); @@ -85,6 +87,8 @@ class MoeFfnLayer { int* en2f_{}; float* scales_{}; + float* shared_scales_{}; + int* accum_{}; int* offsets_{}; }; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 68392215f6..28e8b5f649 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -26,9 +26,15 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx, true); - moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + + if (moe.expert_num) { + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + } + + if (model.inter_size) { + ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -190,9 +196,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con /// feed-forward network if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - else { + + if (ffn_layer_) { int layer_id = layer; // int is needed TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; @@ -203,6 +210,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, 
con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 8db13652f5..38552be0cf 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -301,6 +301,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.expert_num = model_reader["expert_num"].as(0); moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); + moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); handleMissingParams(); From adf7c361531b45a032e58bc6293c4df33c182fbe Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:21:28 +0800 Subject: [PATCH 05/40] Support mixtral moe AWQ quantization. (#2725) * moe-awq * skip gate * add skipped_modules * fix search-scale * update autotest --- autotest/utils/quantization_utils.py | 2 +- docs/en/quantization/w4a16.md | 1 - docs/zh_cn/quantization/w4a16.md | 1 - lmdeploy/cli/utils.py | 4 +- lmdeploy/lite/apis/auto_awq.py | 10 +--- lmdeploy/lite/apis/calibrate.py | 48 +++++++++++++++++-- .../lite/quantization/activation/observer.py | 5 +- lmdeploy/lite/quantization/awq.py | 30 ++++++++---- lmdeploy/lite/quantization/calibration.py | 35 +------------- 9 files changed, 75 insertions(+), 61 deletions(-) diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 752168958a..bc09ed9a4c 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -40,7 +40,7 @@ def quantization(config, now the type is ' + quantization_type if 'llama-3' in origin_model_name.lower(): - quantization_cmd += ' --search-scale True' + quantization_cmd += ' --search-scale' if not is_bf16_supported(): quantization_cmd += ' --batch-size 8' diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 32dfe18d80..0aa1e17a5b 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -39,7 +39,6 @@ lmdeploy lite auto_awq \ --w-bits 4 \ --w-group-size 128 \ --batch-size 1 \ - --search-scale False \ --work-dir $WORK_DIR ``` diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index d50e464af3..d69a8a23d2 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -39,7 +39,6 @@ lmdeploy lite auto_awq \ --w-bits 4 \ --w-group-size 128 \ --batch-size 1 \ - --search-scale False \ --work-dir $WORK_DIR ``` diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index ad7a058c8f..85784a58f5 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -358,10 +358,10 @@ def calib_search_scale(parser): return parser.add_argument( '--search-scale', - type=bool, + action='store_true', default=False, help=\ - 'Whether search scale ratio. Default to False, which means only smooth quant with 0.5 ratio will be applied' # noqa + 'Whether search scale ratio. 
Default to be disabled, which means only smooth quant with 0.5 ratio will be applied' # noqa ) @staticmethod diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index d7d6a5560e..c41b28fd6e 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -101,7 +101,7 @@ def auto_awq(model: str, layer_type = LAYER_TYPE_MAP[type(model).__name__] fc2fcs = FC_FCS_MAP[layer_type] norm2fcs = NORM_FCS_MAP[layer_type] - input_stats = torch.load(work_dir / 'inputs_stats.pth') + input_stats = torch.load(osp.join(work_dir, 'inputs_stats.pth')) layers = collect_target_modules(model, layer_type) fcs = {} for l_name, layer in layers.items(): @@ -117,13 +117,7 @@ def auto_awq(model: str, act_scales = input_stats['absmax'] smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size, device) - quant_weights(model, - fcs, - w_bits, - w_sym, - w_group_size, - device, - skip_if_contains='lora') # TODO quant lora weight + quant_weights(model, fcs, w_bits, w_sym, w_group_size, device) quantization_config = dict(quant_method='awq', version='gemm', bits=w_bits, diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index b2fd8e3883..65ecd765c7 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -24,7 +24,8 @@ 'MGMLlamaForCausalLM': 'LlamaDecoderLayer', # mini gemini 'InternLMXComposer2ForCausalLM': 'InternLM2DecoderLayer', 'Phi3ForCausalLM': 'Phi3DecoderLayer', - 'ChatGLMForConditionalGeneration': 'GLMBlock' + 'ChatGLMForConditionalGeneration': 'GLMBlock', + 'MixtralForCausalLM': 'MixtralDecoderLayer', } NORM_TYPE_MAP = { @@ -39,7 +40,8 @@ 'MGMLlamaForCausalLM': 'LlamaRMSNorm', # mini gemini 'InternLMXComposer2ForCausalLM': 'InternLM2RMSNorm', 'Phi3ForCausalLM': 'Phi3RMSNorm', - 'ChatGLMForConditionalGeneration': 'RMSNorm' + 'ChatGLMForConditionalGeneration': 'RMSNorm', + 'MixtralForCausalLM': 'MixtralRMSNorm', } HEAD_NAME_MAP = { @@ -54,7 +56,8 @@ 'MGMLlamaForCausalLM': 'lm_head', # mini gemini 'InternLMXComposer2ForCausalLM': 'output', 'Phi3ForCausalLM': 'lm_head', - 'ChatGLMForConditionalGeneration': 'output_layer' + 'ChatGLMForConditionalGeneration': 'output_layer', + 'MixtralForCausalLM': 'lm_head', } @@ -150,6 +153,42 @@ def _get_non_default_generation_parameters(self): PretrainedConfig._get_non_default_generation_parameters = _get_non_default_generation_parameters # noqa +def update_moe_mapping(model, model_type): + """Update moe mapping.""" + from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP + + # get experts num + num_experts = 0 + for n, m in model.named_modules(): + if type(m).__name__ == LAYER_TYPE_MAP[model_type]: + fc2fcs = FC_FCS_MAP[LAYER_TYPE_MAP[model_type]] + for k, v in fc2fcs.items(): + if '{i}' in k: + break + num_experts = len(m.get_submodule(k.split('.{i}')[0])) + break + + # update FC_FCS_MAP + updated_fc2fcs = dict() + for prev_fc, post_fc in fc2fcs.items(): + if '{i}' in prev_fc: + for i in range(num_experts): + updated_fc2fcs.update( + {prev_fc.format(i=i): [v.format(i=i) for v in post_fc]}) + else: + updated_fc2fcs.update({prev_fc: post_fc}) + FC_FCS_MAP[LAYER_TYPE_MAP[model_type]] = updated_fc2fcs + # update NORM_FCS_MAP + norm2fcs = NORM_FCS_MAP[LAYER_TYPE_MAP[model_type]] + updated_norm2fcs = dict() + for norm, fc in norm2fcs.items(): + updated_norm2fcs.update({ + norm: + list(set([v.format(i=i) for v in fc for i in range(num_experts)])) + }) + NORM_FCS_MAP[LAYER_TYPE_MAP[model_type]] = updated_norm2fcs + + def calibrate(model: str, calib_dataset: str = 
'ptb', calib_samples: int = 128, @@ -216,6 +255,9 @@ def calibrate(model: str, f'not supported. The supported model types are ' f"{', '.join(LAYER_TYPE_MAP.keys())}.") + if model_type in ['MixtralForCausalLM']: + update_moe_mapping(model, model_type) + if model_type == 'QWenLMHeadModel': try: import flash_attn # noqa: F401 diff --git a/lmdeploy/lite/quantization/activation/observer.py b/lmdeploy/lite/quantization/activation/observer.py index c66bdda6f4..9138c6ccce 100644 --- a/lmdeploy/lite/quantization/activation/observer.py +++ b/lmdeploy/lite/quantization/activation/observer.py @@ -99,11 +99,10 @@ def observe(self, x: torch.Tensor, save_input: bool = False) -> None: Args: x : Input tensor """ + assert torch.isnan(x).sum() == 0 if self.observed: return - if len(x.shape) != 3: - return - assert x.size(2) == self.dim + assert x.size(-1) == self.dim cur_val = x.flatten(0, 1) cur_max = cur_val.max(0)[0].cpu() cur_min = cur_val.min(0)[0].cpu() diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index fad7c2ef30..068ad9357e 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -39,6 +39,12 @@ 'GLMBlock': { 'input_layernorm': ['self_attention.query_key_value'], 'post_attention_layernorm': ['mlp.dense_h_to_4h'] + }, + 'MixtralDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': + ['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3'] } } @@ -73,9 +79,23 @@ 'GLMBlock': { # 'self_attention.query_key_value': ['self_attention.dense'] # 'mlp.dense_h_to_4h': ['mlp.dense_4h_to_h'] + }, + 'MixtralDecoderLayer': { + 'self_attn.v_proj': ['self_attn.o_proj'], + 'block_sparse_moe.experts.{i}.w3': ['block_sparse_moe.experts.{i}.w2'] } } +SKIPPED_MODULE = ['lora', 'block_sparse_moe.gate'] + + +def skipped_module(name: str): + """Whether the module should be skipped from quantization.""" + for m in SKIPPED_MODULE: + if m in name: + return True + return False + @torch.no_grad() def get_weight_scale(weight, q_group_size=-1): @@ -225,13 +245,7 @@ def check_awq_supported(layer_type): raise NotImplementedError -def quant_weights(model, - fcs, - bits, - symmetry, - group_size=-1, - device='cuda', - skip_if_contains: str = None): +def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'): """Quantize the weights of the target model's linear layers.""" from lmdeploy.lite.quantization import WeightQuantizer from lmdeploy.lite.quantization.modules import WeightOnlyQLinear @@ -241,7 +255,7 @@ def quant_weights(model, parent_name, _, child_name = name.rpartition('.') parent = model.get_submodule(parent_name) pack_or_skip = 'packed' - if skip_if_contains and skip_if_contains in child_name: + if skipped_module(name): q_linear = fc pack_or_skip = 'skipped' else: diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py index 77ff74e234..e590f1a4eb 100644 --- a/lmdeploy/lite/quantization/calibration.py +++ b/lmdeploy/lite/quantization/calibration.py @@ -6,8 +6,7 @@ from torch import nn from transformers import PreTrainedTokenizer -from lmdeploy.lite.quantization.activation import (ActivationObserver, - KVCacheObserver) +from lmdeploy.lite.quantization.activation import ActivationObserver from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP from lmdeploy.lite.utils import (bimap_name_mod, collect_target_modules, concat_decoder_layer_outputs, @@ -27,8 +26,6 @@ class CalibrationContext(): 
inp_obs_group = 'inputs' out_obs_group = 'outputs' - key_obs_group = 'keys' - value_obs_group = 'values' def __init__(self, model: nn.Module, @@ -75,7 +72,6 @@ def __init__(self, self._init_input_observers(self.name2fc) self._init_output_observers(self.name2norm) self._init_output_observers(self.name2fc) - self._init_kv_observers(self.name2layer) self.device = device @@ -102,14 +98,6 @@ def _init_output_observers(self, name2mod): obs = ActivationObserver(mod.weight.size(0)) obs.global_available(name, group=self.out_obs_group) - def _init_kv_observers(self, name2mod): - """Initialize KV observers for given modules.""" - for name in name2mod.keys(): - k_obs = KVCacheObserver(self.num_kv_heads, self.head_dim) - v_obs = KVCacheObserver(self.num_kv_heads, self.head_dim) - k_obs.global_available(name, group=self.key_obs_group) - v_obs.global_available(name, group=self.value_obs_group) - def _insert_input_observers(self): """Insert input observers into the target modules. @@ -221,27 +209,6 @@ def collect_outputs_stats(self): outputs_stats['absmean'][name] = obs.absmean_val return outputs_stats - def collect_kv_stats(self): - """Collect statistics (min, max, absmax values) of the observed keys - and values. - - Returns a tuple of two dictionaries with these collected stats. - """ - key_stats = {'max': {}, 'min': {}, 'absmax': {}} - obs_group = KVCacheObserver.find_group(self.key_obs_group) - for name, obs in obs_group.items(): - key_stats['max'][name] = obs.max_val - key_stats['min'][name] = obs.min_val - key_stats['absmax'][name] = obs.absmax_val - - value_stats = {'max': {}, 'min': {}, 'absmax': {}} - obs_group = KVCacheObserver.find_group(self.value_obs_group) - for name, obs in obs_group.items(): - value_stats['max'][name] = obs.max_val - value_stats['min'][name] = obs.min_val - value_stats['absmax'][name] = obs.absmax_val - return key_stats, value_stats - def export(self, out_dir): """Export the calibration statistics (inputs, outputs, keys and values) to specified directory. From 9f6ff9b2b63d4883338ee6f7ed2b03e7a932729e Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Wed, 13 Nov 2024 15:37:54 +0800 Subject: [PATCH 06/40] Check server input (#2719) * validate server input * remove unused * addl ToolMessage * remove check * remove check * remove check * update * update --- lmdeploy/model.py | 24 +++++++++++++++++++----- lmdeploy/serve/openai/protocol.py | 3 +-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 2b3a0a4e1d..1872502334 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -3,7 +3,7 @@ import json import uuid from abc import abstractmethod -from typing import List, Literal, Optional +from typing import List, Literal, Optional, Union from mmengine import Registry @@ -18,6 +18,20 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) +def get_text(content: Union[str, List[dict]]): + """Within the OpenAI API, the content field may be specified as either a + string or a list of ChatCompletionContentPartTextParam (defined in openai). + + When a list is provided, lmdeploy selects the first element to incorporate + into the chat template, as the manner in which OpenAI processes lists is + not explicitly defined. + """ + + if isinstance(content, str): + return content + return content[0]['text'] + + @dataclasses.dataclass class ChatTemplateConfig: """Parameters for chat template. 
@@ -219,7 +233,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): ret += f'{self.system}{self.meta_instruction}{self.eosys}' for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) ret += f'{box_map[role]}{content}{eox_map[role]}' if len(messages) and messages[-1]['role'] == 'assistant': return ret[:-len(eox_map['assistant'])] # prefix of response @@ -509,7 +523,7 @@ def messages2prompt(self, messages.insert(insert_index, tools_prompt) for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) if role == 'assistant' and message.get('tool_calls', None) is not None: for tool_call in message['tool_calls']: @@ -862,7 +876,7 @@ def messages2prompt(self, ret += f'{self.system}{self.knowledge}{self.tools}{tool_prompt}{self.eotools}{self.meta_instruction}{self.eosys}' for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) if role == 'assistant' and ('<|python_tag|>' in content or '' in content): ret += f'{box_map[role]}{content}<|eom_id|>' @@ -1038,7 +1052,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): count = 0 for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) if role == 'user': count += 1 ret += f'[Round {count}]\n\n' diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index d4bf8ed315..2b9d39c7b7 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -114,7 +114,7 @@ class ChatCompletionRequest(BaseModel): temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 tools: Optional[List[Tool]] = Field(default=None, examples=[None]) - tool_choice: Union[ToolChoice, Literal['auto', 'required','none']] = Field(default='auto', examples=['none']) # noqa + tool_choice: Union[ToolChoice, Literal['auto', 'required', 'none']] = Field(default='auto', examples=['none']) # noqa logprobs: Optional[bool] = False top_logprobs: Optional[int] = None n: Optional[int] = 1 @@ -242,7 +242,6 @@ class CompletionRequest(BaseModel): stream_options: Optional[StreamOptions] = Field(default=None, examples=[None]) top_p: Optional[float] = 1.0 - logprobs: Optional[int] = None echo: Optional[bool] = False presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 From 20544d3c9020d2000bf56fb33ee6e32bd017c8fc Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Wed, 13 Nov 2024 17:38:46 +0800 Subject: [PATCH 07/40] fix issue that mono-internvl failed to fallback pytorch engine (#2744) --- lmdeploy/turbomind/supported_models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index f6772fddd5..bb3533254b 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -96,10 +96,12 @@ def _is_head_dim_supported(cfg): # glm-4v-9b not supported support_by_turbomind = False elif arch == 'InternVLChatModel': - support_by_turbomind = _is_head_dim_supported(cfg.llm_config) + llm_arch = cfg.llm_config.architectures[0] + support_by_turbomind = (llm_arch in SUPPORTED_ARCHS and + _is_head_dim_supported(cfg.llm_config)) elif arch == 'LlavaForConditionalGeneration': - sub_arch = cfg.text_config.architectures[0] - if sub_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: + llm_arch = cfg.text_config.architectures[0] + 
if llm_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: support_by_turbomind = _is_head_dim_supported( cfg.text_config) From 72503185fa121a2ce2305f7dca56f9136152db33 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Wed, 13 Nov 2024 18:39:59 +0800 Subject: [PATCH 08/40] optimize dlinfer moe (#2741) --- lmdeploy/pytorch/backends/dlinfer/moe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 90f6335ecb..6ada730fbe 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -19,9 +19,8 @@ def __init__(self, top_k: int, dim: int = -1): def forward(self, x: torch.Tensor): routing_weights, selected_experts = moe_gating_topk_softmax( - x, self.top_k) - return routing_weights.to(torch.float32), selected_experts.to( - torch.int64) + x.to(torch.float32), self.top_k) + return routing_weights, selected_experts class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): From a21def9e7722058689e8a4eeeb1c957c2a948b88 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Thu, 14 Nov 2024 11:34:12 +0800 Subject: [PATCH 09/40] Support chemvlm (#2738) * update to support chemvlm * update docs * add ut --- README.md | 1 + README_ja.md | 2 ++ README_zh-CN.md | 1 + docs/en/supported_models/supported_models.md | 2 ++ .../supported_models/supported_models.md | 2 ++ lmdeploy/model.py | 3 +++ lmdeploy/vl/model/internvl.py | 12 +++++++++--- tests/test_lmdeploy/test_model.py | 19 +++++++++++++++++++ 8 files changed, 39 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index efbb87a22e..5b6ad47bdf 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
  • Mono-InternVL (2B)
+  • ChemVLM (8B-26B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
diff --git a/README_ja.md b/README_ja.md
index df4647d868..bdd9ddb02d 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -152,6 +152,8 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • DeepSeek-VL (7B)
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
+  • Mono-InternVL (2B)
+  • ChemVLM (8B-26B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 477fed6f79..550922d081 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -158,6 +158,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
  • Mono-InternVL (2B)
+  • ChemVLM (8B-26B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 371e4968e0..90ca90388b 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -30,6 +30,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | | InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | @@ -81,6 +82,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | | Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 7d59a59899..fecfdee200 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -30,6 +30,7 @@ | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | | InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | @@ -81,6 +82,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | | Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 1872502334..db864a8344 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -565,6 +565,9 @@ def match(cls, model_path: str) -> Optional[str]: return None return 'internvl-internlm2' + if 'chemvlm' in path: + return 'internvl-internlm2' + @MODELS.register_module(name='internvl2-internlm2') class InternVL2InternLM2(InternLM2Chat7B): diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index d85fe30939..fa67192f11 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -108,8 +108,15 @@ def build_model(self): # avoid randomness in inference. 
self.model = model.eval() self.config = config + dynamic_image_size = getattr(self.config, 'dynamic_image_size', False) + image_processor = None + try: + image_processor = CLIPImageProcessor.from_pretrained( + self.model_path) + except OSError: + pass - if getattr(self.config, 'dynamic_image_size', False): + if dynamic_image_size or image_processor is None: logger.info('using InternVL-Chat-V1-5 vision preprocess') MEAN = (0.485, 0.456, 0.406) STD = (0.229, 0.224, 0.225) @@ -126,8 +133,7 @@ def build_model(self): ]) self._forward_func = self._forward_v1_5 else: - self.image_processor = CLIPImageProcessor.from_pretrained( - self.model_path) + self.image_processor = image_processor self._forward_func = self._forward def _preprocess_v1_5(self, images: List[Image], params: List[Dict] = None): diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index a38971e4d0..7e3e71793d 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -475,6 +475,25 @@ def test_internvl2(): assert res == expected +def test_chemvlm(): + deduced_name = best_match_model('AI4Chem/ChemVLM-8B') + + assert deduced_name == 'internvl-internlm2' + model = MODELS.get(deduced_name)() + messages = [{ + 'role': 'user', + 'content': 'who are you' + }, { + 'role': 'assistant', + 'content': 'I am an AI' + }] + expected = '<|im_start|>system\nYou are an AI assistant whose name is '\ + 'InternLM (书生·浦语).<|im_end|>\n<|im_start|>user\nwho are you'\ + '<|im_end|>\n<|im_start|>assistant\nI am an AI' + res = model.messages2prompt(messages) + assert res == expected + + def test_codegeex4(): model_path_and_name = 'THUDM/codegeex4-all-9b' deduced_name = best_match_model(model_path_and_name) From fd8906c1c4bc37a359b9677d0cbef694a23ab00e Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Thu, 14 Nov 2024 13:07:01 +0800 Subject: [PATCH 10/40] Support molmo in turbomind (#2716) * initial moe support * dynamic grouped gemm * benchmark * moe benchmark * moe sampling * split-k * refactor tuning * simplify * n-major weight * add `num` for `MatrixLayout` * packed rows * packed cols * dispatch for packed rows * w4a16 moe * refactor model loading * fix pytorch loader * refactor * dispatch w4a16 moe * fix loader * add comment * fix msvc build * fix msvc build * fix msvc build * fix ut * fix ut * fix p-lora * add all support arches * minor * fix lint * fix lint * fix lint * fix ut * bf16 support * minor * checkin molmo conversion * add chat template * refactor * fix lint * fix ut * Just for test: hardcode vocab_size * minor * minor * minor * fix inter_size config * load with non-standard filenames * fix loader * fix missing default param * defer the loading of misc weights for safetensors * add embedding_size * update * update * tmp * tmp * update molmo template * vision embedding * fix * update * fix * fix messages2prompt in templates * fix order of out_messages * fix * add user guide * update is_supported --------- Co-authored-by: Li Zhang --- docs/en/multi_modal/index.rst | 2 + docs/en/multi_modal/molmo.md | 92 +++++++++ docs/zh_cn/multi_modal/index.rst | 2 + docs/zh_cn/multi_modal/molmo.md | 92 +++++++++ lmdeploy/archs.py | 3 +- lmdeploy/model.py | 31 +++ lmdeploy/serve/vl_async_engine.py | 5 + lmdeploy/turbomind/deploy/config.py | 7 + .../turbomind/deploy/source_model/__init__.py | 1 + .../turbomind/deploy/source_model/molmo.py | 122 ++++++++++++ .../turbomind/deploy/target_model/base.py | 3 + lmdeploy/turbomind/supported_models.py | 8 +- lmdeploy/vl/model/builder.py | 10 +- 
lmdeploy/vl/model/molmo.py | 177 ++++++++++++++++++ lmdeploy/vl/templates.py | 80 ++++++++ src/turbomind/models/llama/LlamaWeight.cc | 15 +- src/turbomind/models/llama/LlamaWeight.h | 2 + src/turbomind/models/llama/llama_params.h | 1 + .../triton_backend/llama/LlamaTritonModel.cc | 8 + 19 files changed, 653 insertions(+), 8 deletions(-) create mode 100644 docs/en/multi_modal/molmo.md create mode 100644 docs/zh_cn/multi_modal/molmo.md create mode 100644 lmdeploy/turbomind/deploy/source_model/molmo.py create mode 100644 lmdeploy/vl/model/molmo.py diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst index 62f724070f..a68fe3da4f 100644 --- a/docs/en/multi_modal/index.rst +++ b/docs/en/multi_modal/index.rst @@ -12,3 +12,5 @@ Vision-Language Models minicpmv.md phi3.md mllama.md + qwen2_vl.md + molmo.md diff --git a/docs/en/multi_modal/molmo.md b/docs/en/multi_modal/molmo.md new file mode 100644 index 0000000000..dfff43dc64 --- /dev/null +++ b/docs/en/multi_modal/molmo.md @@ -0,0 +1,92 @@ +# Molmo + +LMDeploy supports the following molmo series of models, which are detailed in the table below: + +| Model | Size | Supported Inference Engine | +| :-------------: | :--: | :------------------------: | +| Molmo-7B-D-0924 | 7B | TurboMind | +| Molmo-72-0924 | 72B | TurboMind | + +The next chapter demonstrates how to deploy a molmo model using LMDeploy, with [Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924) as an example. + +## Installation + +Please install LMDeploy by following the [installation guide](../get_started/installation.md) + +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('allenai/Molmo-7B-D-0924') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe((f'describe this image', image)) +print(response) +``` + +More examples are listed below: + +
    + + multi-image multi-round conversation, combined images + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('allenai/Molmo-7B-D-0924', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(do_sample=False)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(do_sample=False)) +``` + +
    + +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server allenai/Molmo-7B-D-0924 +``` + +You can also start the service using the docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server allenai/Molmo-7B-D-0924 +``` + +If you find the following logs, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. + +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst index 0942d8d31c..bd141ea90f 100644 --- a/docs/zh_cn/multi_modal/index.rst +++ b/docs/zh_cn/multi_modal/index.rst @@ -12,3 +12,5 @@ minicpmv.md phi3.md mllama.md + qwen2_vl.md + molmo.md diff --git a/docs/zh_cn/multi_modal/molmo.md b/docs/zh_cn/multi_modal/molmo.md new file mode 100644 index 0000000000..1dc8f8f79b --- /dev/null +++ b/docs/zh_cn/multi_modal/molmo.md @@ -0,0 +1,92 @@ +# Qwen2-VL + +LMDeploy 支持 Molmo 系列模型,具体如下: + +| Model | Size | Supported Inference Engine | +| :-------------: | :--: | :------------------------: | +| Molmo-7B-D-0924 | 7B | TurboMind | +| Molmo-72-0924 | 72B | TurboMind | + +本文将以[Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924) 为例,演示使用 LMDeploy 部署 Molmo 系列模型的方法 + +## 安装 + +请参考[安装文档](../get_started/installation.md)安装 LMDeploy。 + +## 离线推理 + +以下是使用 pipeline 进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('allenai/Molmo-7B-D-0924') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe((f'describe this image', image)) +print(response) +``` + +更多例子如下: + +
    + + 多图多轮对话 + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('Qwen/Qwen2-VL-2B-Instruct', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
    + +## 在线服务 + +你可以通过 `lmdeploy serve api_server` CLI 工具启动服务: + +```shell +lmdeploy serve api_server Qwen/Qwen2-VL-2B-Instruct +``` + +也可以基于 docker image 启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:qwen2vl \ + lmdeploy serve api_server Qwen/Qwen2-VL-2B-Instruct +``` + +如果日志中有如下信息,就表明服务启动成功了。 + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +有关 `lmdeploy serve api_server` 的详细参数可以通过`lmdeploy serve api_server -h`查阅。 + +关于 `api_server` 更多的介绍,以及访问 `api_server` 的方法,请阅读[此处](api_server_vl.md) diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index 8284c99741..ce5cbd98ff 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -121,7 +121,8 @@ def check_vl_llm(config: dict) -> bool: 'InternVLChatModel', 'MiniGeminiLlamaForCausalLM', 'MGMLlamaForCausalLM', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM', - 'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration' + 'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration', + 'MolmoForCausalLM' ]) if arch == 'QWenLMHeadModel' and 'visual' in config: return True diff --git a/lmdeploy/model.py b/lmdeploy/model.py index db864a8344..c9eb71c2c3 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -1747,6 +1747,37 @@ def match(cls, model_path: str) -> Optional[str]: return 'internvl-phi3' +@MODELS.register_module(name='molmo') +class Molmo(BaseChatTemplate): + + def __init__(self, + user=' User: ', + eoh='', + assistant=' Assistant:', + eoa='', + separator=' ', + stop_words=['<|endoftext|>'], + **kwargs): + super().__init__(user=user, + eoh=eoh, + assistant=assistant, + eoa=eoa, + separator=separator, + stop_words=stop_words, + **kwargs) + + @classmethod + def match(cls, model_path: str) -> Optional[str]: + """Return the model_name that was registered to MODELS. + + Args: + model_path (str): the model path used for matching. + """ + path = model_path.lower() + if 'molmo' in path: + return 'molmo' + + def best_match_model(query: str) -> Optional[str]: """Get the model that matches the query. 
diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index fd0b0bb5e4..c293cd71c8 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -64,6 +64,7 @@ async def _get_prompt_input(self, results = {} input_ids = [] from lmdeploy.vl.templates import (MllamaTempateWrapper, + MolmoChatTemplateWrapper, Qwen2VLChatTemplateWrapper) ranges = None grid_thws = None @@ -99,6 +100,10 @@ async def _get_prompt_input(self, results['cross_attention_states'] = features[0] return results + if isinstance(self.vl_prompt_template, + MolmoChatTemplateWrapper): + return features[0] + features = [x.cpu().numpy() for x in features] input_ids = [] begins = [] diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index a535b0d4c1..c724b085a0 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -35,6 +35,13 @@ class ModelConfig: kv_head_num: int = None hidden_units: int = None vocab_size: int = None + # Turbomind used to assume token_embedding and lm_head has the same size + # at vocab dim, i.e. `vocab_size` + # But in molmo, embedding.shape is [vocab_size + 128, hidden_units] + # while lm_head shape is [hidden_units, vocab_size]. + # Therefore, we add a new attr "embedding_size" to represent the vocab dim + # of token_embedding + embedding_size: int = 0 num_layer: int = None inter_size: int = None norm_eps: float = None diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index b1da698e2e..de16bdc0a0 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -9,5 +9,6 @@ from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 +from .molmo import MolmoModel # noqa: F401 from .qwen import QwenModel # noqa: F401 from .xcomposer2 import Xcomposer2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/molmo.py b/lmdeploy/turbomind/deploy/source_model/molmo.py new file mode 100644 index 0000000000..541e201046 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/molmo.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp + +import torch + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class MolmoReader(LlamaReader): + attn_layer_prefix = 'model.transformer.blocks' + attn_layer_patten = r'model.transformer.blocks.([0-9]+).' + norm_weight_key = 'model.transformer.ln_f.weight' + output_weight_key = 'model.transformer.ff_out.weight' + + # In molmo, names of attention parameters are "att_proj.bias", + # "att_proj.weight", "attn_norm.weight", "attn_out.weight", and names + # of ffn parameters are "ff_norm", "ff_out", "ff_proj", so we + # make the patterns are r'att' and r'ffn_', respectively. 
+ attn_pattern = r'att' + ffn_pattern = r'ff_' + + def tok_embeddings(self): + embed1 = self.params.get('model.transformer.wte.embedding', None) + embed2 = self.params.get('model.transformer.wte.new_embedding', None) + if embed1 is not None and embed2 is not None: + return torch.cat((embed1, embed2), dim=0) + else: + assert embed1 is None and embed2 is None + return None + + def attn_norm(self, i: int): + """Get attn norm for layer i.""" + return self.params[f'{self.attn_layer_prefix}.{i}.attn_norm.weight'] + + def _attn(self, i: int, kind: str): + """Get q, k, v, o kind(weight, bias, qweight) for layer i. + + Args: + i (int): layer id + kind (str): can be one of ["weight", "bias", "qweight"] + """ + q, k, v = (None, ) * 3 + hidden_size = self.model_cfg['hidden_size'] + head_num = self.model_cfg['num_attention_heads'] + kv_head_num = self.model_cfg['num_key_value_heads'] + head_dim = hidden_size // head_num + assert head_dim == 128 + fused_dims = (hidden_size, kv_head_num * head_dim, + kv_head_num * head_dim) + qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.att_proj.{kind}') + qkv = self.transform(qkv, kind) + if qkv is not None: + q, k, v = qkv.split(fused_dims, dim=0) + o = self.params.get(f'{self.attn_layer_prefix}.{i}.attn_out.{kind}') + o = self.transform(o, kind) + if o is None: # handle the case when qkv has bias but o doesn't + o = torch.zeros_like(q) + return (q, k, v, o) + + def _ffn(self, i: int, kind: str): + """Get ffn kind(weight, qweight) for layer i.""" + up_and_gate = self.params[ + f'{self.attn_layer_prefix}.{i}.ff_proj.{kind}'] + up_and_gate = self.transform(up_and_gate, kind) + gate, up = up_and_gate.chunk(2, dim=0) + down = self.params[f'{self.attn_layer_prefix}.{i}.ff_out.{kind}'] + down = self.transform(down, kind) + return (up, down, gate) + + def ffn_norm(self, i: int): + """Get ffn norm for layer i.""" + return self.params[f'{self.attn_layer_prefix}.{i}.ff_norm.weight'] + + +@INPUT_MODELS.register_module(name='molmo') +class MolmoModel(LlamaModel): + + Reader = MolmoReader + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + config_path = osp.join(self.model_path, 'config.json') + with open(config_path) as f: + self.config = json.load(f) + + def tokenizer_info(self): + + n_words = 152064 + bos_id = 151643 + eos_id = 151643 + return n_words, bos_id, eos_id + + def model_info(self): + config = self.config + num_layer = config['num_hidden_layers'] + norm_eps = config['layer_norm_eps'] + attn_head_num = config['num_attention_heads'] + kv_head_num = config['num_key_value_heads'] + hidden_units = config['hidden_size'] + rope_theta = config['rope_theta'] + max_position_embeddings = config['max_position_embeddings'] + vocab_size = config['vocab_size'] + # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L2041 + additional_vocab_size = 128 + inter_size = config['intermediate_size'] // 2 + attn_bias = config['qkv_bias'] + return dict( + num_layer=num_layer, + norm_eps=norm_eps, + head_num=attn_head_num, + kv_head_num=kv_head_num, + hidden_units=hidden_units, + attn_bias=int(attn_bias), + inter_size=inter_size, + vocab_size=vocab_size, + # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L564 + embedding_size=vocab_size + additional_vocab_size, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + ) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 
abd570cd00..09699ade09 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -92,6 +92,9 @@ def update_model_config(self): final_cfg = config_to_dict(self.model_config) final_cfg.update(dict(start_id=bos_id, end_id=eos_id)) final_cfg.update(self.input_model_info) + if 'embedding_size' not in self.input_model_info.keys(): + final_cfg.update( + embedding_size=self.input_model_info['vocab_size']) self.model_config = config_from_dict(ModelConfig, final_cfg) diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index bb3533254b..e66da22df0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -42,7 +42,9 @@ ChatGLMModel='glm4', ChatGLMForConditionalGeneration='glm4', # mixtral - MixtralForCausalLM='mixtral') + MixtralForCausalLM='mixtral', + MolmoForCausalLM='molmo', +) def is_supported(model_path: str): @@ -104,5 +106,9 @@ def _is_head_dim_supported(cfg): if llm_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: support_by_turbomind = _is_head_dim_supported( cfg.text_config) + elif arch == 'MolmoForCausalLM': + kv_heads = cfg.num_key_value_heads + # TM hasn't supported allenai/Molmo-7B-O-0924 yet + support_by_turbomind = kv_heads is not None return support_by_turbomind diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py index 9e71f7d1c0..2401b42259 100644 --- a/lmdeploy/vl/model/builder.py +++ b/lmdeploy/vl/model/builder.py @@ -18,6 +18,7 @@ from .mini_gemeni import MiniGeminiVisionModel # noqa F401 from .minicpmv import MiniCPMVModel # noqa F401 from .mllama import MllamaVLModel # noqa F401 +from .molmo import MolmoVisionModel # noqa F401 from .phi3_vision import Phi3VisionModel # noqa F401 from .qwen import QwenVisionModel # noqa F401 from .qwen2 import Qwen2VLModel # noqa F401 @@ -31,7 +32,14 @@ def load_vl_model(model_path: str, with_llm: bool = False, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None): - """load visual model.""" + """load visual model. + + Args: + model_path(str): the path or repo_id from model hub of the model + with_llm(bool): whether to remove the LLM part from the model. + When it is False, it means removing LLM part + backend_config: the config of the inference engine + """ if not os.path.exists(model_path): revision = getattr(backend_config, 'revision', None) download_dir = getattr(backend_config, 'download_dir', None) diff --git a/lmdeploy/vl/model/molmo.py b/lmdeploy/vl/model/molmo.py new file mode 100644 index 0000000000..9abae7a309 --- /dev/null +++ b/lmdeploy/vl/model/molmo.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from typing import Dict, List + +import torch +from PIL.Image import Image +from transformers import AutoModelForCausalLM, AutoProcessor + +from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy.vl.model.base import VISION_MODELS, VisonModel +from lmdeploy.vl.model.utils import disable_logging + +logger = get_logger('lmdeploy') + + +@VISION_MODELS.register_module() +class MolmoVisionModel(VisonModel): + """molmo's vision model.""" + + _arch = 'MolmoForCausalLM' + + def build_model(self): + """Load model.""" + from accelerate import init_empty_weights, load_checkpoint_and_dispatch + with init_empty_weights(): + config = self.hf_config + model = AutoModelForCausalLM.from_config(config, + trust_remote_code=True) + if not self.with_llm: + # Remove nn modules other than embedding from the LLM model + for key in ['emb_drop', 'ln_f', 'blocks', 'ff_out']: + del model.model.transformer[key] + self.token_embedding = model.model.transformer.wte + else: + self.vl_model = model + + with disable_logging(): + load_checkpoint_and_dispatch( + model=model, + checkpoint=self.model_path, + device_map='auto' if not self.with_llm else {'': 'cpu'}, + max_memory=self.max_memory, + no_split_module_classes=[ + 'ResidualAttentionBlock', 'Embedding' + ]) + + # We need eval mode to freeze the weights in model, thus, + # avoid randomness in inference. + self.model = model.eval() + self.config = config + + self.processor = AutoProcessor.from_pretrained(self.model_path, + trust_remote_code=True, + torch_dtype='auto', + device_map='auto') + + @torch.no_grad() + def forward(self, + images: List[Image], + params: List[Dict] = None) -> List[Dict]: + """forward the model with given input. + + Args: + images (List): [None] it is not used + params (List): the inputs after precessing GPT4V messages in + `MolmoChatTemplateWrapper`. Its format is like the following: + [[ + {'role': 'user', 'content': 'user prompt'}, + {'role': 'asssistant', 'content': 'assistant prompt'}, + {'role': 'user', 'content': 'user prompt', 'images': [PIL image list]}, + ... + ]] + """ # noqa + + messages = params[0] + assert isinstance(messages, List) + # append an assistant message to `messages` + messages.append(dict(role='assistant', content='')) + # results is a list of tuple(input_ids, embeddings) + results = [] + # the concat prompt. It is not used during inference but to adhere the + # interface definition of `_get_prompt_input` in `class VLAsyncEngine` + prompts = '' + # Prepend BOS + # qwen2 and olmo do not have a BOS, and instead use EOS as a generic + # separator token. + bos = (self.processor.tokenizer.bos_token_id + or self.processor.tokenizer.eos_token_id) + results.append(([bos], None)) + for i, message in enumerate(messages): + if 'images' in message.keys(): + prompts += ' User: ' + (IMAGE_TOKEN + '\n') * len( + message['images']) + message['content'] + prompt = f' User: {message["content"]}' + tokens = self.processor.tokenizer.encode( + prompt, add_special_tokens=False) + # preprocess images. 
The output is a dict + inputs = self.processor.process(images=message['images'], + tokens=tokens) + inputs = { + k: v.to(self.model.device).unsqueeze(0) + for k, v in inputs.items() + } + input_ids = inputs['input_ids'] + # remove the bos from input_ids which is prepended by molmo's + # processor + input_ids = input_ids[:, 1:] + images = inputs[ + 'images'] # (batch_size, num_image, num_patch, d_model) + image_input_idx = inputs[ + 'image_input_idx'] # (batch_size, num_image, num_patch) + image_masks = inputs['image_masks'] + batch_size, seq_len = input_ids.size() + assert batch_size == 1 + + # Get embeddings of input. + if input_ids is not None: + input_ids = input_ids * (input_ids != -1).to( + input_ids.dtype) + embeddings = self.model.model.transformer.wte(input_ids) + image_features, _ = self.model.model.vision_backbone( + images, image_masks) + num_image, num_patch = image_features.shape[1:3] + assert image_input_idx.shape == (batch_size, num_image, + num_patch) + + # insert the image feature into the embedding. + image_features = image_features.view(batch_size, + num_image * num_patch, -1) + image_input_idx = image_input_idx.view(batch_size, + num_image * num_patch) + + valid = image_input_idx >= 0 + batch_idx = torch.arange(batch_size, device=embeddings.device) + batch_idx = torch.tile(batch_idx[:, None], + [1, image_features.shape[1]]) + image_features = image_features.to(embeddings.device) + embeddings[batch_idx[valid], + image_input_idx[valid]] += image_features[valid] + assert embeddings.shape[:2] == (batch_size, seq_len) + results.append((input_ids.flatten().tolist(), embeddings)) + else: + role = message['role'] + content = message['content'] + assert isinstance(content, str) + prompt = '' + if role == 'user': + prompt = f' User: {content}' + elif role == 'assistant': + prompt = f' Assistant:{content}' + else: + assert 0, f'molmo does not support role {role}, message is {message}' # noqa + input_ids = self.processor.tokenizer.encode( + prompt, add_special_tokens=False) + results.append((input_ids, None)) + prompts += prompt + + # concat input_ids from results, calculate the range in the input_ids + # where embeddings will be copied to + input_ids = [] + input_embeddings = [] + input_embedding_ranges = [] + start = 0 + for _input_ids, _embeddings in results: + if _embeddings is not None: + input_embeddings.append(_embeddings.cpu()) + end = start + len(_input_ids) + input_embedding_ranges.append((start, end)) + input_ids += _input_ids + start += len(_input_ids) + return [ + dict(prompt=prompts, + input_ids=input_ids, + input_embeddings=input_embeddings, + input_embedding_ranges=input_embedding_ranges) + ] diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py index 45e457ad2c..cdf398868a 100644 --- a/lmdeploy/vl/templates.py +++ b/lmdeploy/vl/templates.py @@ -428,6 +428,84 @@ class GLM4VChatTemplateWrapper(VLChatTemplateWrapper): pass +class MolmoChatTemplateWrapper(VLChatTemplateWrapper): + + async def async_collect_pil_images( + self, messages: List[Dict]) -> List[Tuple[PIL.Image.Image, Dict]]: + """collect images from messages. 
+ + Args: + messages (List[Dict]): a user request of GPT4V message format + """ + if isinstance(messages, Dict): + messages = [messages] + assert isinstance(messages, List) + + out_messages = [None] * len(messages) + + def _inner_call(i, in_messages, out_messages): + role = in_messages[i]['role'] + content = in_messages[i]['content'] + if role != 'user' or isinstance(content, str): + # means message is user's prompt input or assistant's prompt, + # returning it directory + out_messages[i] = in_messages[i] + return + # the role is a user and the content is a list + assert isinstance(content, List) + message = dict(role=role, content='', images=[]) + for item in content: + # 'image_url': means url or local path to image. + # 'image_data': means PIL.Image.Image object. + if item['type'] == 'image_url': + try: + image = load_image(item['image_url']['url']) + message['images'].append(image) + except KeyError: + logger.error(f'invalid format {message}') + elif item['type'] == 'image_data': + try: + image = load_image(item['image_data']['data']) + message['images'].append(image) + except KeyError: + logger.error(f'invalid format {message}') + elif item['type'] == 'text': + message['content'] = item['text'] + else: + logger.error(f'unexpected content type {message}') + out_messages[i] = message + + await asyncio.gather(*[ + asyncio.get_event_loop().run_in_executor(None, _inner_call, i, + messages, out_messages) + for i in range(len(messages)) + ]) + return [(None, out_messages)] + + def messages2prompt(self, messages, sequence_start=True, **kwargs) -> str: + """Return a placeholder "IMAGE_TOKEN" so that + `vl_asyn_engine._get_prompt_input` can know that it.""" + if isinstance(messages, str): + return self.chat_template.messages2prompt(messages, sequence_start) + else: + _messages = [] + for message in messages: + role, content = message['role'], message['content'] + if role != 'user' or isinstance(content, str): + _messages.append(message) + continue + for item in content: + item_type = item['type'] + if item_type in ['image_url', 'image_data']: + # Return the image placeholder so that + # `vl_asyn_engine._get_prompt_input` can know that the + # request contains images + return IMAGE_TOKEN + _messages.append(dict(role=role, content=item[item_type])) + return self.chat_template.messages2prompt(_messages, + sequence_start) + + def get_vl_prompt_template(model_path: str, chat_template: BaseModel, model_name: str) -> VLChatTemplateWrapper: """get vision language prompt template.""" @@ -467,4 +545,6 @@ def get_vl_prompt_template(model_path: str, chat_template: BaseModel, return GLM4VChatTemplateWrapper(chat_template) elif arch == 'Qwen2VLForConditionalGeneration': return Qwen2VLChatTemplateWrapper(chat_template) + elif arch == 'MolmoForCausalLM': + return MolmoChatTemplateWrapper(chat_template) raise ValueError(f'unsupported vl_prompt_template with arch {arch}') diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 1ac2d82dd9..9d62042d62 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -32,6 +32,7 @@ LlamaWeight::LlamaWeight(size_t head_num, size_t hidden_units, size_t inter_size, size_t vocab_size, + size_t embedding_size, size_t num_layer, bool attn_bias, WeightType weight_type, @@ -44,16 +45,20 @@ LlamaWeight::LlamaWeight(size_t head_num, inter_size_(inter_size), vocab_size_(vocab_size), vocab_size_padded_(vocab_size), + embedding_size_(embedding_size), num_layer_(num_layer), 
weight_type_(weight_type), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank) { if (vocab_size_padded_ % tensor_para_size_ != 0) { - vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; + vocab_size_padded_ = (vocab_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_); } - + if (embedding_size_ % tensor_para_size_ != 0) { + embedding_size_ = (embedding_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; + TM_LOG_WARNING("pad embed size from %d to %d", embedding_size_, embedding_size_); + } FT_CHECK(hidden_units_ % tensor_para_size_ == 0); decoder_layer_weights.reserve(num_layer_); @@ -96,7 +101,7 @@ template void LlamaWeight::mallocWeights() { FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_ / tensor_para_size_); + deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_); deviceMalloc((T**)&output_norm_weight, hidden_units_); deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); } @@ -111,7 +116,7 @@ void LlamaWeight::loadModel(std::string dir_path) dir_path += '/'; loadWeightFromBin((T*)pre_decoder_embedding_table, - {vocab_size_padded_ * hidden_units_ / tensor_para_size_}, + {embedding_size_ * hidden_units_ / tensor_para_size_}, dir_path + "tok_embeddings." + std::to_string(tensor_para_rank_) + ".weight", model_file_type); @@ -135,7 +140,7 @@ TensorMap LlamaWeight::getParams() output.insert("tok_embeddings." + std::to_string(tensor_para_rank_) + ".weight", Tensor{MEMORY_GPU, getTensorType(), - {vocab_size_padded_ * hidden_units_ / tensor_para_size_ * sizeof(T)}, + {embedding_size_ * hidden_units_ / tensor_para_size_ * sizeof(T)}, pre_decoder_embedding_table}); output.insert("norm.weight", diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c04bf6c5a6..c30e753565 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -35,6 +35,7 @@ struct LlamaWeight { size_t hidden_units, size_t inter_size, size_t vocab_size, + size_t embedding_size, size_t num_layer, bool attn_bias, WeightType weight_type, @@ -67,6 +68,7 @@ struct LlamaWeight { size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; + size_t embedding_size_; size_t num_layer_; WeightType weight_type_; size_t tensor_para_size_; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 2ea63f0410..e6b9d690ae 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -18,6 +18,7 @@ struct ModelParam { size_t layer_num; size_t inter_size; size_t vocab_size; + size_t embedding_size; float norm_eps; int quant_policy; // diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 38552be0cf..2deca46380 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -133,6 +133,12 @@ void LlamaTritonModel::handleMissingParams() (int)model_param_.kv_head_num); } + if (model_param_.embedding_size == 0) { + model_param_.embedding_size = model_param_.vocab_size; + TM_LOG_WARNING("[LlamaTritonModel] `embedding_size` is not set, default 
to `vocab_size` (%d).", + (int)model_param_.vocab_size); + } + if (!attn_param_.max_position_embeddings) { attn_param_.max_position_embeddings = 2048; TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.", @@ -252,6 +258,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.layer_num = model_reader["num_layer"].as(); model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); + model_param_.embedding_size = model_reader["embedding_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); @@ -417,6 +424,7 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) model_param_.hidden_units, model_param_.inter_size, model_param_.vocab_size, + model_param_.embedding_size, model_param_.layer_num, attn_bias_, weight_type_, From 59c1c63b992eb332f6408554f1d6de146a6f7733 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Thu, 14 Nov 2024 13:12:39 +0800 Subject: [PATCH 11/40] Update ascend readme (#2756) * Update get_started.md of ascend * Update get_started.md of ascend --- docs/en/get_started/ascend/get_started.md | 4 ++-- docs/zh_cn/get_started/ascend/get_started.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index a5400ed64d..9e963b3795 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -49,7 +49,7 @@ For more information about running the Docker client on Ascend devices, please r ## Offline batch inference > \[!TIP\] -> Graph mode has been supported on Atlas 800T A2. Currently, LLaMa3-8B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Graph mode has been supported on Atlas 800T A2. > Users can set `eager_mode=False` to enable graph mode, or, set `eager_mode=True` to disable graph mode. > (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) @@ -86,7 +86,7 @@ if __name__ == "__main__": ## Online serving > \[!TIP\] -> Graph mode has been supported on Atlas 800T A2. Currently, InternLM2-7B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Graph mode has been supported on Atlas 800T A2. > Graph mode is default enabled in online serving. Users can add `--eager-mode` to disable graph mode. 
> (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index e00c1e173a..046aea756b 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -49,7 +49,7 @@ docker run -e ASCEND_VISIBLE_DEVICES=0 --rm --name lmdeploy -t lmdeploy-aarch64- ## 离线批处理 > \[!TIP\] -> 图模式已经支持了Atlas 800T A2。目前,单卡下的LLaMa3-8B/LLaMa2-7B/Qwen2-7B已经通过测试。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) +> 图模式已经支持了Atlas 800T A2。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) ### LLM 推理 @@ -84,7 +84,7 @@ if __name__ == "__main__": ## 在线服务 > \[!TIP\] -> 图模式已经支持Atlas 800T A2。目前,单卡下的InternLM2-7B/LLaMa2-7B/Qwen2-7B已经通过测试。 +> 图模式已经支持Atlas 800T A2。 > 在线服务时,图模式默认开启,用户可以添加`--eager-mode`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) ### LLM 模型服务 From 8e0076a059cd27d1ba1deb2801af2c3668c563d4 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Thu, 14 Nov 2024 14:29:59 +0800 Subject: [PATCH 12/40] feat: support multi cards in ascend graph mode (#2755) * support multi cards in ascend graph mode * update warning info * update warning info --- .../backends/dlinfer/ascend/graph_runner.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py index 7dbb86d4b6..b69cb1dca5 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -22,7 +22,6 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, super().__init__(model, model_config, cache_config, backend_config, device) - self.supported_model = ['Llama3-8B', 'Llama2-7B', 'Qwen2-7B'] self.enable_graph = self.check_enable_graph() if self.enable_graph: import dlinfer.graph @@ -39,26 +38,23 @@ def check_enable_graph(self): # eager_mode if self.backend_config.eager_mode: return False - # tp - if torch.distributed.is_initialized(): - warnings.warn( - "Graph mode of device_type 'ascend' only supports tp=1 " - 'for now, fallback to eager mode', RuntimeWarning) - return False warnings.warn( '\n\n' - '**********************************************************\n' - ' The following models were tested in graph mode of\n' - " device_type 'ascend' when tp=1:\n" - f" {', '.join(self.supported_model)}\n" - ' Other LLaMa-like models may work in graph mode, please\n' - ' check the result yourself!\n' - ' If graph mode does not work correctly with your model,\n' - ' please use eager mode instead.\n' - '**********************************************************\n\n', + '************************************************************\n' + ' Graph mode is an experimental feature. 
We currently\n' + ' support both dense and Mixture of Experts (MoE) models\n' + ' with bf16 and fp16 data types.\n' + ' If graph mode does not function correctly with your model,\n' + ' please consider using eager mode as an alternative.\n' + '************************************************************\n\n', RuntimeWarning) + # tp + if torch.distributed.is_initialized(): + torch._inductor.config.compile_threads = 1 + return True + return True def patch_kernels_custom_op(self): From 21f2866e54a0acd3b7b241f4e44ba6337aac2025 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Fri, 15 Nov 2024 00:42:28 +0800 Subject: [PATCH 13/40] Remove use_fast=True when loading tokenizer for lite auto_awq (#2758) --- lmdeploy/lite/apis/calibrate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 65ecd765c7..cd5178793d 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -236,7 +236,6 @@ def calibrate(model: str, if model_type == 'llm': # Load tokenizer and configuration tokenizer = AutoTokenizer.from_pretrained(model, - use_fast=False, trust_remote_code=True) model = load_hf_from_pretrained(model, From 9ecc44abeb99e672c17959b4543e041c14f2221c Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 15 Nov 2024 19:19:23 +0800 Subject: [PATCH 14/40] set wrong head_dim for mistral-nemo (#2761) --- lmdeploy/turbomind/deploy/source_model/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index a8aa51b144..0c702d6588 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -153,6 +153,7 @@ def model_info(self): max_position_embeddings = int( model_arg.get('max_position_embeddings', 0)) rope_scaling = model_arg.get('rope_scaling', None) + head_dim = model_arg.get('head_dim', hidden_units // attn_head_num) scaling_factor = 0.0 use_dynamic_ntk = 0 scaling_type = '' @@ -189,7 +190,7 @@ def model_info(self): beta_slow = rope_scaling.get('beta_slow', 1.0) return dict( - size_per_head=hidden_units // attn_head_num, + size_per_head=head_dim, rotary_embedding=hidden_units // attn_head_num, num_layer=num_layer, norm_eps=norm_eps, From 0c80baa001e79d0b7d182b8a670190801d2d8d5b Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Sat, 16 Nov 2024 12:29:24 +0800 Subject: [PATCH 15/40] bump version to v0.6.3 (#2754) * bump version to v0.6.3 * update supported models --- README.md | 1 + README_ja.md | 1 + README_zh-CN.md | 1 + docs/en/get_started/installation.md | 2 +- docs/en/multi_modal/vl_pipeline.md | 16 +++------------- docs/en/supported_models/supported_models.md | 1 + docs/zh_cn/get_started/installation.md | 2 +- docs/zh_cn/multi_modal/vl_pipeline.md | 16 +++------------- docs/zh_cn/supported_models/supported_models.md | 1 + lmdeploy/version.py | 2 +- 10 files changed, 14 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 5b6ad47bdf..d160338aa6 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • Llama3.2-vision (11B, 90B)
+  • Molmo (7B-D,72B)
diff --git a/README_ja.md b/README_ja.md
index bdd9ddb02d..fda176229e 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -163,6 +163,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • Llama3.2-vision (11B, 90B)
+  • Molmo (7B-D,72B)
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 550922d081..6c24b2e500 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -168,6 +168,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • Llama3.2-vision (11B, 90B)
+  • Molmo (7B-D,72B)
  • diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index b7d03b28a6..b3e8bb8abd 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: ```shell -export LMDEPLOY_VERSION=0.6.2 +export LMDEPLOY_VERSION=0.6.3 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/en/multi_modal/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md index 4881b99071..9632c9e6df 100644 --- a/docs/en/multi_modal/vl_pipeline.md +++ b/docs/en/multi_modal/vl_pipeline.md @@ -2,24 +2,14 @@ LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](../llm/pipeline.md). -Currently, it supports the following models. - -- [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) -- LLaVA series: [v1.5](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) -- [Yi-VL](https://huggingface.co/01-ai/Yi-VL-6B) -- [DeepSeek-VL](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) -- [InternVL](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) -- [Mono-InternVL](https://huggingface.co/OpenGVLab/Mono-InternVL-2B) -- [MGM](https://huggingface.co/YanweiLi/MGM-7B) -- [XComposer](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) -- [CogVLM](https://github.com/InternLM/lmdeploy/tree/main/docs/en/multi_modal/cogvlm.md) - -We genuinely invite the community to contribute new VLM support to LMDeploy. Your involvement is truly appreciated. +The supported models are listed [here](../supported_models/supported_models.md). We genuinely invite the community to contribute new VLM support to LMDeploy. Your involvement is truly appreciated. This article showcases the VLM pipeline using the [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) model as a case study. You'll learn about the simplest ways to leverage the pipeline and how to gradually unlock more advanced features by adjusting engine parameters and generation arguments, such as tensor parallelism, context window sizing, random sampling, and chat template customization. Moreover, we will provide practical inference examples tailored to scenarios with multiple images, batch prompts etc. +Using the pipeline interface to infer other VLM models is similar, with the main difference being the configuration and installation dependencies of the models. You can read [here](https://lmdeploy.readthedocs.io/en/latest/multi_modal/index.html) for environment installation and configuration methods for different models. 
+ ## A 'Hello, world' example ```python diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 90ca90388b..a122f10ec8 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -36,6 +36,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | "-" means not verified yet. diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index 3108d64815..12562c51d5 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: ```shell -export LMDEPLOY_VERSION=0.6.2 +export LMDEPLOY_VERSION=0.6.3 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/zh_cn/multi_modal/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md index 570598311a..35f647e36c 100644 --- a/docs/zh_cn/multi_modal/vl_pipeline.md +++ b/docs/zh_cn/multi_modal/vl_pipeline.md @@ -2,24 +2,14 @@ LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单好用的 pipeline。它的用法与大语言模型(LLM)推理 [pipeline](../llm/pipeline.md) 类似。 -目前,VLM pipeline 支持以下模型: - -- [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) -- LLaVA series: [v1.5](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) -- [Yi-VL](https://huggingface.co/01-ai/Yi-VL-6B) -- [DeepSeek-VL](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) -- [InternVL](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) -- [Mono-InternVL](https://huggingface.co/OpenGVLab/Mono-InternVL-2B) -- [MGM](https://huggingface.co/YanweiLi/MGM-7B) -- [XComposer](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) -- [CogVLM](https://github.com/InternLM/lmdeploy/tree/main/docs/zh_cn/multi_modal/cogvlm.md) - -我们诚挚邀请社区在 LMDeploy 中添加更多 VLM 模型的支持。 +在[这个列表中](../supported_models/supported_models.md),你可以查阅每个推理引擎支持的 VLM 模型。我们诚挚邀请社区在 LMDeploy 中添加更多 VLM 模型。 本文将以 [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) 模型为例,展示 VLM pipeline 的用法。你将了解它的最基础用法,以及如何通过调整引擎参数和生成条件来逐步解锁更多高级特性,如张量并行,上下文窗口大小调整,随机采样,以及对话模板的定制。 此外,我们还提供针对多图、批量提示词等场景的实际推理示例。 +使用 pipeline 接口推理其他 VLM 模型,大同小异,主要区别在于模型依赖的配置和安装。你可以阅读[此处](https://lmdeploy.readthedocs.io/zh-cn/latest/multi_modal/),查看不同模型的环境安装和配置方式 + ## "Hello, world" 示例 ```python diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index fecfdee200..f3ffd4311d 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -36,6 +36,7 @@ | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | “-” 表示还没有验证。 diff --git a/lmdeploy/version.py b/lmdeploy/version.py index b9f76b5761..d9f4307a78 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # 
Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.6.2' +__version__ = '0.6.3' short_version = __version__ From 96fa66846e7af4d5e321c7f2694612d269c2c10c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=80=9D=E5=A4=9C=E9=95=BF=E6=AD=8C?= <928926035@qq.com> Date: Tue, 19 Nov 2024 11:21:58 +0800 Subject: [PATCH 16/40] feature: support qwen2.5 fuction_call (#2737) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: support qwen2.5 tools_call * fix: npe bug * fix: 模版不一致 * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * feat: Support multi tools calling * feat: Support multi tools calling * fix: Add '\n' between each tool * fix: Add ensure_ascii=False * bugfix: rfind * bugfix: tools_call -> tool_calls * bugfix: add toolName in tool_response * fix: some '\n' error * fix: remove toolname * fix: replace '\n' to self.separator * feat: add doc with multiple tool calling * fix:update doc * feat: add qwen2.5 prompt template test * feat: add qwen2.5 no tool call prompt test --------- Co-authored-by: gaozixiang --- docs/en/llm/api_server_tools.md | 155 ++++++++++++++- docs/zh_cn/llm/api_server_tools.md | 155 ++++++++++++++- lmdeploy/model.py | 110 ++++++++++- lmdeploy/serve/async_engine.py | 28 ++- lmdeploy/serve/openai/api_server.py | 13 +- tests/test_lmdeploy/test_model.py | 286 ++++++++++++++++++++++++++++ 6 files changed, 736 insertions(+), 11 deletions(-) diff --git a/docs/en/llm/api_server_tools.md b/docs/en/llm/api_server_tools.md index 56fb1b598a..39e91dbf07 100644 --- a/docs/en/llm/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -1,6 +1,6 @@ # Tools Calling -LMDeploy supports tools for InternLM2, InternLM2.5 and llama3.1 models. +LMDeploy supports tools for InternLM2, InternLM2.5, llama3.1 and Qwen2.5 models. ## Single Round Invocation @@ -241,3 +241,156 @@ messages += [ assistant_response = request_llama3_1_service(messages) print(assistant_response) ``` + +### Qwen2.5 + +Qwen2.5 supports multi tool calling, which means that multiple tool requests can be initiated in one request + +```python +from openai import OpenAI +import json + +def get_current_temperature(location: str, unit: str = "celsius"): + """Get current temperature at a location. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, and the unit in a dict + """ + return { + "temperature": 26.1, + "location": location, + "unit": unit, + } + + +def get_temperature_date(location: str, date: str, unit: str = "celsius"): + """Get temperature at a location and date. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + date: The date to get the temperature for, in the format "Year-Month-Day". + unit: The unit to return the temperature in. Defaults to "celsius". 
(choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, the date and the unit in a dict + """ + return { + "temperature": 25.9, + "location": location, + "date": date, + "unit": unit, + } + +def get_function_by_name(name): + if name == "get_current_temperature": + return get_current_temperature + if name == "get_temperature_date": + return get_temperature_date + +tools = [{ + 'type': 'function', + 'function': { + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location' + ] + } + } +}, { + 'type': 'function', + 'function': { + 'name': 'get_temperature_date', + 'description': 'Get temperature at a location and date.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'date': { + 'type': 'string', + 'description': 'The date to get the temperature for, in the format \'Year-Month-Day\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location', + 'date' + ] + } + } +}] +messages = [{'role': 'user', 'content': 'Today is 2024-11-14, What\'s the temperature in San Francisco now? How about tomorrow?'}] + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.tool_calls) +messages.append(response.choices[0].message) + +for tool_call in response.choices[0].message.tool_calls: + tool_call_args = json.loads(tool_call.function.arguments) + tool_call_result = get_function_by_name(tool_call.function.name)(**tool_call_args) + messages.append({ + 'role': 'tool', + 'name': tool_call.function.name, + 'content': tool_call_result, + 'tool_call_id': tool_call.id + }) + +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.content) + +``` + +Using the Qwen2.5-14B-Instruct, similar results can be obtained as follows + +``` +[ChatCompletionMessageToolCall(id='0', function=Function(arguments='{"location": "San Francisco, California, USA"}', name='get_current_temperature'), type='function'), + ChatCompletionMessageToolCall(id='1', function=Function(arguments='{"location": "San Francisco, California, USA", "date": "2024-11-15"}', name='get_temperature_date'), type='function')] + +The current temperature in San Francisco, California, USA is 26.1°C. For tomorrow, 2024-11-15, the temperature is expected to be 25.9°C. +``` + +It is important to note that in scenarios involving multiple tool calls, the order of the tool call results can affect the response quality. The tool_call_id has not been correctly provided to the LLM. 
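
For reference, the server assembles these `tool_calls` by scanning the raw Qwen2.5 completion for `<tool_call>...</tool_call>` blocks (see `parse_tool_response` in `lmdeploy/serve/async_engine.py` in this change). A minimal sketch of that extraction is shown below; the helper name `extract_tool_calls` and the sample text are illustrative only, not part of LMDeploy's public API.

```python
import json
import re


def extract_tool_calls(text: str):
    """Collect every <tool_call>...</tool_call> block from a Qwen2.5 reply.

    Returns (name, arguments) tuples, with `arguments` kept as a JSON string,
    mirroring what the api_server places into `tool_calls`.
    """
    calls = []
    for block in re.findall(r'<tool_call>(.*?)</tool_call>', text, re.DOTALL):
        action = json.loads(block)
        calls.append((action['name'],
                      json.dumps(action['arguments'], ensure_ascii=False)))
    return calls


sample = ('<tool_call>\n{"name": "get_current_temperature", '
          '"arguments": {"location": "San Francisco, California, USA"}}\n'
          '</tool_call>')
print(extract_tool_calls(sample))
# [('get_current_temperature', '{"location": "San Francisco, California, USA"}')]
```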
diff --git a/docs/zh_cn/llm/api_server_tools.md b/docs/zh_cn/llm/api_server_tools.md index 643a39d5d2..8688ea35cd 100644 --- a/docs/zh_cn/llm/api_server_tools.md +++ b/docs/zh_cn/llm/api_server_tools.md @@ -1,6 +1,6 @@ # Tools -LMDeploy 支持 InternLM2, InternLM2.5 和 Llama3.1 模型的工具调用。 +LMDeploy 支持 InternLM2, InternLM2.5, Llama3.1 和 Qwen2.5模型的工具调用。 ## 单轮调用 @@ -241,3 +241,156 @@ messages += [ assistant_response = request_llama3_1_service(messages) print(assistant_response) ``` + +### Qwen2.5 + +Qwen2.5 支持了多工具调用,这意味着可以在一次请求中可能发起多个工具请求 + +```python +from openai import OpenAI +import json + +def get_current_temperature(location: str, unit: str = "celsius"): + """Get current temperature at a location. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, and the unit in a dict + """ + return { + "temperature": 26.1, + "location": location, + "unit": unit, + } + + +def get_temperature_date(location: str, date: str, unit: str = "celsius"): + """Get temperature at a location and date. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + date: The date to get the temperature for, in the format "Year-Month-Day". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, the date and the unit in a dict + """ + return { + "temperature": 25.9, + "location": location, + "date": date, + "unit": unit, + } + +def get_function_by_name(name): + if name == "get_current_temperature": + return get_current_temperature + if name == "get_temperature_date": + return get_temperature_date + +tools = [{ + 'type': 'function', + 'function': { + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location' + ] + } + } +}, { + 'type': 'function', + 'function': { + 'name': 'get_temperature_date', + 'description': 'Get temperature at a location and date.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'date': { + 'type': 'string', + 'description': 'The date to get the temperature for, in the format \'Year-Month-Day\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location', + 'date' + ] + } + } +}] +messages = [{'role': 'user', 'content': 'Today is 2024-11-14, What\'s the temperature in San Francisco now? 
How about tomorrow?'}] + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.tool_calls) +messages.append(response.choices[0].message) + +for tool_call in response.choices[0].message.tool_calls: + tool_call_args = json.loads(tool_call.function.arguments) + tool_call_result = get_function_by_name(tool_call.function.name)(**tool_call_args) + messages.append({ + 'role': 'tool', + 'name': tool_call.function.name, + 'content': tool_call_result, + 'tool_call_id': tool_call.id + }) + +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.content) + +``` + +使用Qwen2.5-14B-Instruct,可以得到以下类似结果 + +``` +[ChatCompletionMessageToolCall(id='0', function=Function(arguments='{"location": "San Francisco, California, USA"}', name='get_current_temperature'), type='function'), + ChatCompletionMessageToolCall(id='1', function=Function(arguments='{"location": "San Francisco, California, USA", "date": "2024-11-15"}', name='get_temperature_date'), type='function')] + +The current temperature in San Francisco, California, USA is 26.1°C. For tomorrow, 2024-11-15, the temperature is expected to be 25.9°C. +``` + +需要注意的是,多工具调用的情况下,工具调用的结果顺序会影响回答的效果,tool_call_id并没有正确给到LLM. diff --git a/lmdeploy/model.py b/lmdeploy/model.py index c9eb71c2c3..47aaaa4e88 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -944,7 +944,8 @@ def match(cls, model_path: str) -> Optional[str]: Args: model_path (str): the model path used for matching. """ - if 'qwen' in model_path.lower(): + if 'qwen' in model_path.lower() and 'qwen2.5' not in model_path.lower( + ): return 'qwen' if 'minicpm-v-2_6' in model_path.lower(): return 'minicpmv-2d6' @@ -952,6 +953,113 @@ def match(cls, model_path: str) -> Optional[str]: return 'minicpm3' +@MODELS.register_module(name='qwen2d5') +class Qwen2d5Chat(Qwen7BChat): + """Chat template for Qwen2.5-Instruct series.""" + + def __init__( + self, + system='<|im_start|>system\n', + meta_instruction='You are Qwen, created by Alibaba Cloud. You are a helpful assistant.', + eosys='<|im_end|>\n', + user='<|im_start|>user\n', + eoh='<|im_end|>\n', + assistant='<|im_start|>assistant\n', + eoa='<|im_end|>', + separator='\n', + tools="""\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n""", + eotools="""\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{"name": , "arguments": }\n""", + stop_words=['<|im_end|>'], + **kwargs): + + self.tools = tools + self.eotools = eotools + super().__init__(system=system, + meta_instruction=meta_instruction, + eosys=eosys, + user=user, + eoh=eoh, + assistant=assistant, + eoa=eoa, + separator=separator, + stop_words=stop_words, + **kwargs) + + def messages2prompt(self, + messages, + sequence_start=True, + tools=None, + **kwargs): + """Return the prompt that is concatenated with other elements in the + chat template. 
+
+        Args:
+            messages (str | List): user's input prompt
+        Returns:
+            str: the concatenated prompt
+        """
+        if isinstance(messages, str):
+            return self.get_prompt(messages, sequence_start)
+        box_map = dict(user=self.user,
+                       assistant=self.assistant,
+                       system=self.system)
+        ret = ''
+        tool_prompt = ''
+        if tools is not None and len(tools) > 0:
+            for tool in tools:
+                tool_prompt += self.separator
+                tool_prompt += f'{{"type": "function", "function": {json.dumps(tool, ensure_ascii=False)}}}'
+            if len(messages) and messages[0]['role'] == 'system':
+                ret += f"{self.system}{messages[0]['content']}{self.tools}{tool_prompt}{self.eotools}{self.eosys}"
+            else:
+                ret += f'{self.system}{self.meta_instruction}{self.tools}{tool_prompt}{self.eotools}{self.eosys}'
+        else:
+            if self.meta_instruction is not None and sequence_start:
+                if len(messages) and messages[0]['role'] == 'system':
+                    ret += f"{self.system}{messages[0]['content']}{self.eosys}"
+                else:
+                    ret += f'{self.system}{self.meta_instruction}{self.eosys}'
+
+        for index, message in enumerate(messages):
+            if (message['role'] == 'user'
+                    or (message['role'] == 'system' and index != 0)
+                    or (message['role'] == 'assistant'
+                        and message.get('tool_calls') is None)):
+                ret += f"{box_map[message['role']]}{message['content']}{self.eosys}"
+            elif message['role'] == 'assistant':
+                ret += f'<|im_start|>assistant'
+                if message.get('content') is not None:
+                    ret += f"{self.separator}{message['content']}"
+
+                if message.get('tool_calls') is not None:
+                    tool_calls = message['tool_calls']
+                    for tool_call in tool_calls:
+                        if tool_call.get('function') is not None:
+                            tool_call = tool_call['function']
+                        ret += f'{self.separator}<tool_call>{self.separator}{{"name": "{tool_call["name"]}", "arguments": {json.dumps(tool_call["arguments"], ensure_ascii=False)}}}{self.separator}</tool_call>'
+                ret += self.eosys
+            if message['role'] == 'tool':
+                if index == 0 or messages[index - 1]['role'] != 'tool':
+                    ret += f'<|im_start|>user'
+                ret += f"{self.separator}<tool_response>{self.separator}{message['content']}{self.separator}</tool_response>"
+                if index == len(messages) - 1 or messages[index +
+                                                          1]['role'] != 'tool':
+                    ret += f'{self.eoh}'
+        ret += f'{self.assistant}'
+        return ret
+
+    @classmethod
+    def match(cls, model_path: str) -> Optional[str]:
+        """Return the model_name that was registered to MODELS.
+
+        Args:
+            model_path (str): the model path used for matching.
+        """
+        lower_path = model_path.lower()
+        if 'qwen2.5' in lower_path or 'qwen2_5' in lower_path:
+            return 'qwen2d5'
+
+
 @MODELS.register_module(name='codellama')
 class CodeLlama(Llama2):
 
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
index 3c8f193cd5..f3c3432328 100644
--- a/lmdeploy/serve/async_engine.py
+++ b/lmdeploy/serve/async_engine.py
@@ -4,6 +4,7 @@
 import json
 import os
 import random
+import re
 from contextlib import asynccontextmanager
 from copy import deepcopy
 from itertools import count
@@ -648,14 +649,37 @@ def parse_tool_response(self, text, tools, **kwargs):
             name, parameters = action['name'], json.dumps(action.get(
                 'parameters', action.get('arguments', {})),
                                                           ensure_ascii=False)
+            call_info_list = [(name, parameters)]
         elif '<function=' in text:  # llama3.1
             action, _ = text.split('</function>')
             parameters = action[action.find('{'):]
             name = action.split('<function=')[1].split('>{')[0]
+            call_info_list = [(name, parameters)]
+        elif '<tool_call>' in text and '</tool_call>' in text:  # qwen2.5
+            # get tool_call in text
+            pattern = r'<tool_call>(.*?)</tool_call>'
+            match_result_list = re.findall(pattern, text, re.DOTALL)
+            call_info_list = []
+            for match_result in match_result_list:
+                action = json.loads(match_result)
+                call_info_list.append((action['name'],
+                                       json.dumps(action['arguments'],
+                                                  ensure_ascii=False)))
+            # get text outside of <tool_call> tags
+            if not text.startswith('<tool_call>'):
+                text = text[:text.find('<tool_call>')]
+            elif not text.endswith('</tool_call>'):
+                text = text[text.rfind('</tool_call>') + len('</tool_call>'):]
+            else:
+                text = ''
+
         else:
             raise RuntimeError(f'Unexpected model response: {text}')
-        action_id = [tool.function.name for tool in tools].index(name)
-        return text, action_id, name, parameters
+
+        call_info_list = [([tool.function.name for tool in tools
+                            ].index(call_info[0]), call_info[0], call_info[1])
+                          for call_info in call_info_list]
+        return text, call_info_list
 
     def chat(self,
              prompt: str,
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index a12cadaa7d..2d0560720d 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -495,17 +495,18 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 final_logprobs.extend(res.logprobs)
 
         tool_calls = None
-        if request.tool_choice != 'none' and ('<|plugin|>' in text
-                                              or '<function=' in text):
+        if request.tool_choice != 'none' and ('<|plugin|>' in text
+                                              or '<function=' in text
+                                              or '<tool_call>' in text):
             if final_res.finish_reason == 'stop':
                 final_res.finish_reason = 'tool_calls'
             try:  # TODO add json_schema guidance to turbomind
-                text, action_id, name, parameters = VariableInterface.async_engine.parse_tool_response(  # noqa
+                text, call_info_list = VariableInterface.async_engine.parse_tool_response(  # noqa
                     text, request.tools)
                 tool_calls = [
-                    ToolCall(id=str(action_id),
-                             function=FunctionResponse(name=name,
-                                                       arguments=parameters))
+                    ToolCall(id=str(call_info[0]),
+                             function=FunctionResponse(name=call_info[1],
+                                                       arguments=call_info[2]))
+                    for call_info in call_info_list
                 ]
             except Exception as e:
                 logger.error(f'Exception: {e}')
diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index 7e3e71793d..3b78053a74 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -9,6 +9,7 @@
     ('internlm/internlm2-1_8b', ['base']),
     ('models--internlm--internlm-chat-7b/snapshots/1234567', ['internlm']),
     ('Qwen/Qwen-7B-Chat', ['qwen']),
+    ('Qwen/Qwen2.5-7B-Instruct', ['qwen2d5']),
     ('codellama/CodeLlama-7b-hf', ['codellama']),
     ('upstage/SOLAR-0-70b', ['solar', 'solar-70b']),
     ('meta-llama/Llama-2-7b-chat-hf', ['llama2']),
@@ -283,6 +284,291 @@ def test_qwen():
     assert _prompt is None
 
 
+def test_qwen2d5():
+    prompt = 'hello, can u introduce yourself'
+
model = MODELS.get('qwen2d5')(capability='completion') + assert model.get_prompt(prompt, sequence_start=True) == prompt + assert model.get_prompt(prompt, sequence_start=False) == prompt + + model = MODELS.get('qwen2d5')(capability='chat') + + # No tool call + messages = [ + dict(role='user', + content='What\'s the temperature in San Francisco now?') + ] + no_tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful ' + "assistant.<|im_end|>\n<|im_start|>user\nWhat's the " + 'temperature in San Francisco ' + 'now?<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages) == no_tool_prompt + assert model.messages2prompt(messages, tools=[]) == no_tool_prompt + + messages.append({'role': 'assistant', 'content': 'I don\'t know.'}) + no_tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful ' + "assistant.<|im_end|>\n<|im_start|>user\nWhat's the " + 'temperature in San Francisco ' + "now?<|im_end|>\n<|im_start|>assistant\nI don't " + 'know.<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages) == no_tool_prompt + # Single tool call + tools = [{ + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': + 'string', + 'description': + 'The location to get the temperature for,' + ' in the format \'City, State, Country\'.' + }, + 'unit': { + 'type': + 'string', + 'enum': ['celsius', 'fahrenheit'], + 'description': + 'The unit to return the temperature in. Defaults to ' + '\'celsius\'.' + } + }, + 'required': ['location'] + } + }] + + messages = [ + dict(role='user', + content='What\'s the temperature in San Francisco now?') + ] + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. Defaults to ' + "'celsius'.\"}}, \"required\": [" + "\"location\"]}}}\n\n\nFor each function call, " + 'return a json object with function name and arguments ' + 'within XML tags:\n\n{' + "\"name\": , \"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nWhat's the temperature in San Francisco " + 'now?<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + + messages.append( + dict(role='tool', + name='get_current_temperature', + content={ + 'temperature': 26.1, + 'location': 'San Francisco, California, USA', + 'unit': 'celsius' + }, + tool_call_id='0')) + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. 
You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. Defaults to ' + "'celsius'.\"}}, \"required\": [" + "\"location\"]}}}\n\n\nFor each function call, " + 'return a json object with function name and arguments ' + 'within XML tags:\n\n{' + "\"name\": , \"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nWhat's the temperature in San Francisco " + 'now?<|im_end|>\n<|im_start|>user\n\n{' + "'temperature': 26.1, 'location': 'San Francisco, " + "California, USA', 'unit': " + "'celsius'}\n<|im_end|>\n<|im_start" + '|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + # Multi tool calling + tools = [{ + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': + 'string', + 'description': + 'The location to get the temperature for, in the format ' + '\'City, State, Country\'.' + }, + 'unit': { + 'type': + 'string', + 'enum': ['celsius', 'fahrenheit'], + 'description': + 'The unit to return the temperature in.' + ' Defaults to \'celsius\'.' + } + }, + 'required': ['location'] + } + }, { + 'name': 'get_temperature_date', + 'description': 'Get temperature at a location and date.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': + 'string', + 'description': + 'The location to get the temperature for,' + ' in the format \'City, State, Country\'.' + }, + 'date': { + 'type': + 'string', + 'description': + 'The date to get the temperature for,' + ' in the format \'Year-Month-Day\'.' + }, + 'unit': { + 'type': + 'string', + 'enum': ['celsius', 'fahrenheit'], + 'description': + 'The unit to return the temperature in.' + ' Defaults to \'celsius\'.' + } + }, + 'required': ['location', 'date'] + } + }] + messages = [ + dict(role='user', + content='Today is 2024-11-14, What\'s the temperature in' + ' San Francisco now? How about tomorrow?') + ] + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. 
Defaults to ' + "'celsius'.\"}}, \"required\": [\"location\"]}}}\n{" + "\"type\": \"function\", \"function\": {\"name\": " + "\"get_temperature_date\", \"description\": \"Get " + "temperature at a location and date.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"date\": {\"type\": \"string\", " + "\"description\": \"The date to get the temperature for, " + "in the format 'Year-Month-Day'.\"}, \"unit\": {\"type\": " + "\"string\", \"enum\": [\"celsius\", \"fahrenheit\"], " + "\"description\": \"The unit to return the temperature " + "in. Defaults to 'celsius'.\"}}, \"required\": [" + "\"location\", \"date\"]}}}\n\n\nFor each " + 'function call, return a json object with function name ' + 'and arguments within XML ' + "tags:\n\n{\"name\": , " + "\"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nToday is 2024-11-14, What's the temperature in " + 'San Francisco now? How about ' + 'tomorrow?<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + + messages.append( + dict(role='tool', + name='get_current_temperature', + content={ + 'temperature': 26.1, + 'location': 'San Francisco, California, USA', + 'unit': 'celsius' + }, + tool_call_id='0')) + messages.append( + dict(role='tool', + name='get_temperature_date', + content={ + 'temperature': 25.9, + 'location': 'San Francisco, California, USA', + 'date': '2024-11-15', + 'unit': 'celsius' + }, + tool_call_id='1')) + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. Defaults to ' + "'celsius'.\"}}, \"required\": [\"location\"]}}}\n{" + "\"type\": \"function\", \"function\": {\"name\": " + "\"get_temperature_date\", \"description\": \"Get " + "temperature at a location and date.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"date\": {\"type\": \"string\", " + "\"description\": \"The date to get the temperature for, " + "in the format 'Year-Month-Day'.\"}, \"unit\": {\"type\": " + "\"string\", \"enum\": [\"celsius\", \"fahrenheit\"], " + "\"description\": \"The unit to return the temperature " + "in. Defaults to 'celsius'.\"}}, \"required\": [" + "\"location\", \"date\"]}}}\n\n\nFor each " + 'function call, return a json object with function name ' + 'and arguments within XML ' + "tags:\n\n{\"name\": , " + "\"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nToday is 2024-11-14, What's the temperature in " + 'San Francisco now? 
How about ' + 'tomorrow?<|im_end|>\n<|im_start|>user\n\n{'temperature': 26.1, 'location': 'San Francisco, " + "California, USA', 'unit': " + "'celsius'}\n\n\n{" + "'temperature': 25.9, 'location': 'San Francisco, " + "California, USA', 'date': '2024-11-15', 'unit': " + "'celsius'}\n<|im_end|>\n<|im_start" + '|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + + def test_codellama_completion(): model = MODELS.get('codellama')(capability='completion') prompt = """\ From 0608b01f87a32a31eea4e7c579edf659a65df6b8 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Tue, 19 Nov 2024 17:58:25 +0800 Subject: [PATCH 17/40] Update supported models & Ascend doc (#2765) * update ascend supported model list * fix markdown * fix markdown * fix lint * Update get_started.md * Update get_started.md --- docs/en/get_started/ascend/get_started.md | 4 +++ docs/en/supported_models/supported_models.md | 28 ++++++++++--------- docs/zh_cn/get_started/ascend/get_started.md | 4 +++ .../supported_models/supported_models.md | 28 ++++++++++--------- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 9e963b3795..23b86afa61 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -3,6 +3,8 @@ The usage of lmdeploy on a Huawei Ascend device is almost the same as its usage on CUDA with PytorchEngine in lmdeploy. Please read the original [Get Started](../get_started.md) guide before reading this tutorial. +Here is the [supported model list](../../supported_models/supported_models.md#PyTorchEngine-on-Huawei-Ascend-Platform). + ## Installation We highly recommend that users build a Docker image for streamlined environment setup. @@ -38,6 +40,8 @@ DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:latest \     -f docker/Dockerfile_aarch64_ascend . ``` +The `Dockerfile_aarch64_ascend` is tested on Kunpeng CPU. For intel CPU, please try [this dockerfile](https://github.com/InternLM/lmdeploy/issues/2745#issuecomment-2473285703) (which is not fully tested) + If the following command executes without any errors, it indicates that the environment setup is successful. ```bash diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index a122f10ec8..684a4f5109 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -98,16 +98,18 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha ## PyTorchEngine on Huawei Ascend Platform -| Model | Size | Type | FP16/BF16 | W4A16 | -| :------------: | :------: | :--: | :-------: | :---: | -| Llama2 | 7B - 70B | LLM | Yes | Yes | -| Llama3 | 8B | LLM | Yes | Yes | -| Llama3.1 | 8B | LLM | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | -| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | -| Mixtral | 8x7B | LLM | Yes | No | -| QWen1.5-MoE | A2.7B | LLM | Yes | No | -| QWen2 | 7B | LLM | Yes | No | -| QWen2-MoE | A14.57B | LLM | Yes | No | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | +| Model | Size | Type | FP16/BF16(eager) | FP16/BF16(graph) | W4A16(eager) | +| :------------: | :------: | :--: | :--------------: | :--------------: | :----------: | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | +| Llama3 | 8B | LLM | Yes | Yes | Yes | +| Llama3.1 | 8B | LLM | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | +| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | Yes | +| Mixtral | 8x7B | LLM | Yes | Yes | No | +| QWen1.5-MoE | A2.7B | LLM | Yes | - | No | +| QWen2(.5) | 7B | LLM | Yes | Yes | No | +| QWen2-MoE | A14.57B | LLM | Yes | - | No | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | +| CogVLM2-chat | 19B | MLLM | Yes | No | - | +| GLM4V | 9B | MLLM | Yes | No | - | diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index 046aea756b..b137c458be 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -2,6 +2,8 @@ 我们基于 LMDeploy 的 PytorchEngine,增加了华为昇腾设备的支持。所以,在华为昇腾上使用 LDMeploy 的方法与在英伟达 GPU 上使用 PytorchEngine 后端的方法几乎相同。在阅读本教程之前,请先阅读原版的[快速开始](../get_started.md)。 +支持的模型列表在[这里](../../supported_models/supported_models.md#PyTorchEngine-华为昇腾平台). + ## 安装 我们强烈建议用户构建一个 Docker 镜像以简化环境设置。 @@ -38,6 +40,8 @@ DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:latest \     -f docker/Dockerfile_aarch64_ascend . ``` +上述`Dockerfile_aarch64_ascend`适用于鲲鹏CPU. 
如果是Intel CPU的机器,请尝试[这个dockerfile](https://github.com/InternLM/lmdeploy/issues/2745#issuecomment-2473285703) (未经过测试) + 如果以下命令执行没有任何错误,这表明环境设置成功。 ```bash diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index f3ffd4311d..d8bf9a1ad8 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -98,16 +98,18 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att ## PyTorchEngine 华为昇腾平台 -| Model | Size | Type | FP16/BF16 | W4A16 | -| :------------: | :------: | :--: | :-------: | :---: | -| Llama2 | 7B - 70B | LLM | Yes | Yes | -| Llama3 | 8B | LLM | Yes | Yes | -| Llama3.1 | 8B | LLM | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | -| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | -| Mixtral | 8x7B | LLM | Yes | No | -| QWen1.5-MoE | A2.7B | LLM | Yes | No | -| QWen2 | 7B | LLM | Yes | No | -| QWen2-MoE | A14.57B | LLM | Yes | No | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | +| Model | Size | Type | FP16/BF16(eager) | FP16/BF16(graph) | W4A16(eager) | +| :------------: | :------: | :--: | :--------------: | :--------------: | :----------: | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | +| Llama3 | 8B | LLM | Yes | Yes | Yes | +| Llama3.1 | 8B | LLM | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | +| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | Yes | +| Mixtral | 8x7B | LLM | Yes | Yes | No | +| QWen1.5-MoE | A2.7B | LLM | Yes | - | No | +| QWen2(.5) | 7B | LLM | Yes | Yes | No | +| QWen2-MoE | A14.57B | LLM | Yes | - | No | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | +| CogVLM2-chat | 19B | MLLM | Yes | No | - | +| GLM4V | 9B | MLLM | Yes | No | - | From 178ec7bddcba23d32d9cdc2488e045624390086c Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Tue, 19 Nov 2024 18:04:39 +0800 Subject: [PATCH 18/40] [CI] Split vl testcases into turbomind and pytorch backend (#2751) * updaet * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --- .github/scripts/eval_base_config.py | 11 ++ .github/scripts/eval_chat_config.py | 21 ++ .github/workflows/daily_ete_test.yml | 47 +++-- .github/workflows/daily_ete_test_v100.yml | 47 +++-- autotest/config-v100.yaml | 18 +- autotest/config.yaml | 55 ++++-- ...h.py => test_pipeline_chat_pytorch_llm.py} | 0 .../test_pipeline_chat_pytorch_mllm.py | 120 ++++++++++++ ...py => test_pipeline_chat_turbomind_llm.py} | 0 .../test_pipeline_chat_turbomind_mllm.py | 139 +++++++++++++ .../test_pipeline_chat_turbomind_vl.py | 109 ----------- ...py => test_restful_chat_hf_pytorch_llm.py} | 0 .../test_restful_chat_hf_pytorch_mllm.py | 116 +++++++++++ ... 
=> test_restful_chat_hf_turbomind_llm.py} | 0 .../test_restful_chat_hf_turbomind_mllm.py | 116 +++++++++++ .../test_restful_chat_hf_turbomind_vl.py | 182 ------------------ autotest/utils/benchmark_utils.py | 9 +- autotest/utils/config_utils.py | 2 +- autotest/utils/pipeline_chat.py | 44 +++-- autotest/utils/run_restful_chat.py | 50 +++++ docs/en/supported_models/supported_models.md | 66 +++---- .../supported_models/supported_models.md | 66 +++---- 22 files changed, 786 insertions(+), 432 deletions(-) rename autotest/tools/pipeline/{test_pipeline_chat_pytorch.py => test_pipeline_chat_pytorch_llm.py} (100%) create mode 100644 autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py rename autotest/tools/pipeline/{test_pipeline_chat_turbomind.py => test_pipeline_chat_turbomind_llm.py} (100%) create mode 100644 autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py delete mode 100644 autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py rename autotest/tools/restful/{test_restful_chat_hf_pytorch.py => test_restful_chat_hf_pytorch_llm.py} (100%) create mode 100644 autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py rename autotest/tools/restful/{test_restful_chat_hf_turbomind.py => test_restful_chat_hf_turbomind_llm.py} (100%) create mode 100644 autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py delete mode 100644 autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 8915decc7c..9a2b5fc39e 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -89,6 +89,17 @@ models as lmdeploy_qwen1_5_7b # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \ models as lmdeploy_qwen2_7b # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 # read models race_datasets = [race_datasets[1]] diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index a54b66bdc8..e2463c0f39 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -98,6 +98,27 @@ models as lmdeploy_qwen2_7b_instruct # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import \ models as lmdeploy_qwen_7b_chat # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.ds1000 import \ + ds1000_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.humanevalx import \ + humanevalx_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from 
opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.scicode import \ + scicode_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.teval import \ + teval_summary_groups # noqa: F401, E501 llama2_meta_template = dict(round=[ dict(role='HUMAN', begin='[INST] ', end=' [/INST]'), diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index ab01d692c0..dbacfc32f5 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -17,10 +17,15 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: '["turbomind", "pytorch", "turbomind_vl"]' + default: "['turbomind', 'pytorch']" model: required: true - description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + description: 'Set testcase module filter: llm, vllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' offline_mode: @@ -206,14 +211,20 @@ jobs: strategy: fail-fast: false matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch", "turbomind_vl"]')}} - model: ${{ fromJSON(inputs.model || '["pipeline", "restful", "chat"]')}} + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} exclude: - - backend: turbomind_vl - model: chat + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat include: - backend: turbomind - model: local_case + model: llm + function: local_case env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /root/modelscope_hub @@ -261,7 +272,7 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'chat' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -269,7 +280,7 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -277,30 +288,30 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: matrix.model == 'pipeline' + if: 
matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: matrix.model == 'restful' + if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'restful' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'restful' run: | pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase - if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' run: | pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') @@ -321,7 +332,7 @@ jobs: strategy: fail-fast: false matrix: - backend: ['turbomind', 'pytorch'] + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} timeout-minutes: 60 container: image: openmmlab/lmdeploy:latest-cu11 diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml index 0112e9aaab..8a662b85f5 100644 --- a/.github/workflows/daily_ete_test_v100.yml +++ b/.github/workflows/daily_ete_test_v100.yml @@ -17,10 +17,15 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. 
Default is "["turbomind", "pytorch"]"' type: string - default: '["turbomind", "pytorch", "turbomind_vl"]' + default: "['turbomind', 'pytorch']" model: required: true - description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + description: 'Set testcase module filter: llm, vllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' offline_mode: @@ -201,14 +206,20 @@ jobs: strategy: fail-fast: false matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch", "turbomind_vl"]')}} - model: ${{ fromJSON(inputs.model || '["pipeline", "restful", "chat"]')}} + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} exclude: - - backend: turbomind_vl - model: chat + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat include: - backend: turbomind - model: local_case + model: llm + function: local_case env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /root/modelscope_hub @@ -255,7 +266,7 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'chat' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -263,7 +274,7 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -271,30 +282,30 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: matrix.model == 'pipeline' + if: matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: matrix.model == 'restful' + if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'restful' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'restful' run: | pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase - if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' run: | pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') @@ -315,7 +326,7 @@ jobs: strategy: fail-fast: false matrix: - backend: ['turbomind', 'pytorch'] + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} timeout-minutes: 120 container: image: openmmlab/lmdeploy:latest-cu12 diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml index de51e7e5e7..41216cb730 100644 --- a/autotest/config-v100.yaml +++ b/autotest/config-v100.yaml @@ -22,6 +22,7 @@ tp_config: turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.2-3B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-8B-Instruct @@ -48,6 +49,7 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B @@ -71,16 +73,25 @@ pytorch_base_model: - internlm/internlm2_5-7b - internlm/internlm2_5-20b -vl_model: +turbomind_vl_model: - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - internlm/internlm-xcomposer2d5-7b - THUDM/glm-4v-9b + +pytorch_vl_model: + - OpenGVLab/InternVL2-1B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mono-InternVL-2B + - Qwen/Qwen2-VL-2B-Instruct + - 
Qwen/Qwen2-VL-7B-Instruct + - THUDM/glm-4v-9b - microsoft/Phi-3.5-vision-instruct turbomind_quatization: @@ -107,10 +118,13 @@ pytorch_quatization: - internlm/internlm2_5-7b-chat - internlm/internlm2_5-7b no_kvint4: + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct + - microsoft/Phi-3.5-vision-instruct + - openbmb/MiniCPM-V-2_6 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 587ee6331b..6c92d2cf0b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -21,6 +21,7 @@ tp_config: turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.2-3B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-8B-Instruct @@ -51,6 +52,7 @@ turbomind_chat_model: - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mistral-Nemo-Instruct-2407 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat @@ -69,11 +71,14 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-11B-Vision-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - internlm/internlm2-chat-20b - internlm/internlm-chat-20b + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B @@ -106,20 +111,7 @@ pytorch_chat_model: - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct -turbomind_base_model: - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - codellama/CodeLlama-7b-hf - -pytorch_base_model: - - tiiuae/falcon-7b - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - bigcode/starcoder2-7b - -vl_model: +turbomind_vl_model: - Qwen/Qwen-VL-Chat - liuhaotian/llava-v1.5-13b - liuhaotian/llava-v1.6-vicuna-7b @@ -129,6 +121,20 @@ vl_model: - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/InternVL2-40B + - internlm/internlm-xcomposer2d5-7b + - internlm/internlm-xcomposer2-4khd-7b + - openbmb/MiniCPM-Llama3-V-2_5 + - openbmb/MiniCPM-V-2_6 + +pytorch_vl_model: + - meta-llama/Llama-3.2-11B-Vision-Instruct + - OpenGVLab/InternVL-Chat-V1-5 + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-1B + - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B @@ -136,15 +142,24 @@ vl_model: - OpenGVLab/Mono-InternVL-2B - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - internlm/internlm-xcomposer2d5-7b - - internlm/internlm-xcomposer2-4khd-7b - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - - microsoft/Phi-3.5-vision-instruct - microsoft/Phi-3-vision-128k-instruct - - openbmb/MiniCPM-Llama3-V-2_5 - - openbmb/MiniCPM-V-2_6 + - microsoft/Phi-3.5-vision-instruct + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-1_8b + - internlm/internlm2_5-20b + - codellama/CodeLlama-7b-hf + +pytorch_base_model: + - tiiuae/falcon-7b + - internlm/internlm2_5-7b + - internlm/internlm2_5-1_8b + - internlm/internlm2_5-20b + - 
bigcode/starcoder2-7b turbomind_quatization: no_awq: @@ -184,10 +199,12 @@ pytorch_quatization: - internlm/internlm2_5-20b - internlm/internlm2_5-7b no_kvint4: + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct + - microsoft/Phi-3.5-vision-instruct - openbmb/MiniCPM-V-2_6 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py similarity index 100% rename from autotest/tools/pipeline/test_pipeline_chat_pytorch.py rename to autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py new file mode 100644 index 0000000000..276ced5bcb --- /dev/null +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -0,0 +1,120 @@ +import os +from multiprocessing import get_context + +import pytest +from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list +from utils.pipeline_chat import (assert_pipeline_vl_chat_log, + run_pipeline_vl_chat_test) + +BACKEND = 'pytorch' + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, model_type='vl_model')) +def test_pipeline_chat_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, model_type='vl_model')) +def test_pipeline_chat_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp2(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, 
model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py similarity index 100% rename from autotest/tools/pipeline/test_pipeline_chat_turbomind.py rename to autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py new file mode 100644 index 0000000000..8f1bc7d8b1 --- /dev/null +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -0,0 +1,139 @@ +import os +from multiprocessing import get_context + +import pytest +from utils.config_utils import get_all_model_list, get_cuda_id_by_workerid +from utils.pipeline_chat import (assert_pipeline_vl_chat_log, + run_pipeline_vl_chat_test) + +BACKEND = 'turbomind' + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=1, model_type='vl_model')) +def test_pipeline_chat_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=2, model_type='vl_model')) +def test_pipeline_chat_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=1, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in 
worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=2, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp2(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=1, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=2, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.pr_test +@pytest.mark.parametrize('model', [ + 'liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', + 'OpenGVLab/InternVL2-8B', 'internlm/internlm-xcomposer2d5-7b' +]) +def test_pipeline_pr_test(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = str( + int(get_cuda_id_by_workerid(worker_id)) + 5) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py deleted file mode 100644 index 3279495493..0000000000 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -from multiprocessing import Process - -import pytest -from utils.config_utils import get_cuda_id_by_workerid, get_vl_model_list -from utils.pipeline_chat import (assert_pipeline_vl_chat_log, - run_pipeline_vl_chat_test) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1)) -def test_pipeline_chat_tp1(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = 
get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2)) -def test_pipeline_chat_tp2(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, - tp_num=2) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=4)) -def test_pipeline_chat_kvint4_tp1(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=4)) -def test_pipeline_chat_kvint4_tp2(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, - tp_num=2) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=8)) -def test_pipeline_chat_kvint8_tp1(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 8)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=8)) -def test_pipeline_chat_kvint8_tp2(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, - tp_num=2) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 8)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.pr_test -@pytest.mark.parametrize('model', [ - 'liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', - 'OpenGVLab/InternVL2-8B', 'internlm/internlm-xcomposer2d5-7b' -]) -def test_pipeline_pr_test(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = str( - int(get_cuda_id_by_workerid(worker_id)) + 5) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py similarity index 100% rename from autotest/tools/restful/test_restful_chat_hf_pytorch.py rename to autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py 
b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py new file mode 100644 index 0000000000..b210733db4 --- /dev/null +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -0,0 +1,116 @@ +import pytest +from utils.config_utils import get_torch_model_list, get_workerid +from utils.run_restful_chat import (run_vl_testcase, start_restful_api, + stop_restful_api) + +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + model_path = config.get('model_path') + '/' + model + + pid, startRes = start_restful_api(config, param, model, model_path, + 'pytorch', worker_id) + yield + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + } for item in get_torch_model_list(tp_num, model_type='vl_model')] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +def getKvintModelList(tp_num, quant_policy: int = None): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + 'extra': f'--quant-policy {quant_policy}' + } for item in get_torch_model_list( + tp_num, quant_policy=quant_policy, model_type='vl_model') + if 'qwen2' not in item.lower() or quant_policy == 8] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, 
port=DEFAULT_PORT + get_workerid(worker_id)) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py similarity index 100% rename from autotest/tools/restful/test_restful_chat_hf_turbomind.py rename to autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py new file mode 100644 index 0000000000..091e18e6e3 --- /dev/null +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -0,0 +1,116 @@ +import pytest +from utils.config_utils import get_all_model_list, get_workerid +from utils.run_restful_chat import (run_vl_testcase, start_restful_api, + stop_restful_api) + +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + model_path = config.get('model_path') + '/' + model + + pid, startRes = start_restful_api(config, param, model, model_path, + 'turbomind', worker_id) + yield + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + } for item in get_all_model_list(tp_num, model_type='vl_model')] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +def getKvintModelList(tp_num, quant_policy: int = None): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + 'extra': f'--quant-policy {quant_policy}' + } for item in get_all_model_list( + tp_num, quant_policy=quant_policy, model_type='vl_model') + if 'qwen2' not in item.lower() or quant_policy == 8] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + 
run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py deleted file mode 100644 index 6e636d7ad4..0000000000 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py +++ /dev/null @@ -1,182 +0,0 @@ -import os - -import allure -import pytest -from openai import OpenAI -from utils.config_utils import get_vl_model_list, get_workerid -from utils.run_restful_chat import start_restful_api, stop_restful_api - -from lmdeploy.serve.openai.api_client import APIClient - -BASE_HTTP_URL = 'http://localhost' -DEFAULT_PORT = 23333 - - -@pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config, worker_id): - param = request.param - model = param['model'] - model_path = config.get('model_path') + '/' + model - - pid, startRes = start_restful_api(config, param, model, model_path, - 'turbomind', worker_id) - yield - stop_restful_api(pid, startRes, param) - - -def getModelList(tp_num): - return [{ - 'model': item, - 'cuda_prefix': None, - 'tp_num': tp_num - } for item in get_vl_model_list(tp_num)] - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('prepare_environment', - getModelList(tp_num=1), - indirect=True) -def test_restful_chat_tp1(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('prepare_environment', - getModelList(tp_num=2), - indirect=True) -def test_restful_chat_tp2(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -def getKvintModelList(tp_num, quant_policy: int = None): - return [{ - 'model': item, - 'cuda_prefix': None, - 'tp_num': tp_num, - 'extra': f'--quant-policy {quant_policy}' - } for item in get_vl_model_list(tp_num, quant_policy) - if 'qwen2' not in item.lower() or quant_policy == 8] - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=1, quant_policy=4), - indirect=True) -def test_restful_chat_kvint4_tp1(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=2, quant_policy=4), - indirect=True) -def test_restful_chat_kvint4_tp2(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=1, quant_policy=8), - indirect=True) -def 
test_restful_chat_kvint8_tp1(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=2, quant_policy=8), - indirect=True) -def test_restful_chat_kvint8_tp2(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -PIC = 'https://raw.githubusercontent.com/' + \ - 'open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' - - -def run_all_step(config, port: int = DEFAULT_PORT): - http_url = BASE_HTTP_URL + ':' + str(port) - log_path = config.get('log_path') - - client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') - model_name = client.models.list().data[0].id - - restful_log = os.path.join( - log_path, - 'restful_vl_' + model_name.split('/')[-1] + str(port) + '.log') - file = open(restful_log, 'w') - - response = client.chat.completions.create( - model=model_name, - messages=[{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'Describe the image please', - }, { - 'type': 'image_url', - 'image_url': { - 'url': PIC, - }, - }], - }], - temperature=0.8, - top_p=0.8) - file.writelines(str(response).lower() + '\n') - assert 'tiger' in str(response).lower() or '虎' in str( - response).lower(), response - - api_client = APIClient(http_url) - model_name = api_client.available_models[0] - messages = [{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'Describe the image please', - }, { - 'type': 'image_url', - 'image_url': { - 'url': PIC, - }, - }] - }] - for item in api_client.chat_completions_v1(model=model_name, - messages=messages): - continue - file.writelines(str(item) + '\n') - assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item - - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 4c2e0a2c90..c38568e6f1 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -27,7 +27,8 @@ def generation_test(config, model_path = '/'.join([config.get('model_path'), model]) log_path = config.get('log_path') benchmark_log = os.path.join( - log_path, 'benchmark_' + model.split('/')[1] + worker_id + '.log') + log_path, + 'benchmark_generation_' + model.split('/')[1] + worker_id + '.log') benchmark_path = '/'.join([ config.get('benchmark_path'), run_id, model, f'benchmark-generation-{backend}' @@ -86,7 +87,8 @@ def throughput_test(config, log_path = config.get('log_path') dataset_path = config.get('dataset_path') benchmark_log = os.path.join( - log_path, 'benchmark_' + model.split('/')[1] + worker_id + '.log') + log_path, + 'benchmark_throughput_' + model.split('/')[1] + worker_id + '.log') if backend == 'turbomind' and quant_policy != 0: benchmark_path = '/'.join([ config.get('benchmark_path'), run_id, model, @@ -150,7 +152,8 @@ def restful_test(config, log_path = config.get('log_path') dataset_path = config.get('dataset_path') benchmark_log = os.path.join( - log_path, 'benchmark_' + model.split('/')[1] + worker_id + '.log') + log_path, + 'benchmark_restful_' + model.split('/')[1] + worker_id + '.log') if backend == 'turbomind' and quant_policy != 0: benchmark_path = '/'.join([ config.get('benchmark_path'), run_id, model, diff --git 
a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 8aa5f933fb..24b4a3f8cd 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -23,7 +23,7 @@ def get_turbomind_model_list(tp_num: int = None, quatization_case_config = config.get('turbomind_quatization') for key in config.get('turbomind_' + model_type): - if key not in quatization_case_config.get( + if key in case_list and key not in quatization_case_config.get( 'no_awq') and not is_quantization_model(key): case_list.append(key + '-inner-4bits') for key in quatization_case_config.get('gptq'): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index e9988f0e39..562a707efe 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -61,7 +61,7 @@ def run_pipeline_chat_test(config, ])) file = open(config_log, 'w') log_string = '\n'.join([ - 'reproduce config info:', + 'reproduce config info:', 'from lmdeploy import pipeline', 'from lmdeploy.messages import PytorchEngineConfig', 'from lmdeploy.messages import TurbomindEngineConfig', 'engine_config = ' + str(backend_config), @@ -273,24 +273,29 @@ def assert_pipeline_single_element(output, return result -PIC1 = 'https://raw.githubusercontent.com/' + \ - 'open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' -PIC2 = 'https://raw.githubusercontent.com/' + \ - 'open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' +PIC1 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 +PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' # noqa E501 -def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): +def run_pipeline_vl_chat_test(config, + model_case, + backend, + worker_id: str = '', + quant_policy: int = None): log_path = config.get('log_path') tp = get_tp_num(config, model_case) model_path = config.get('model_path') hf_path = model_path + '/' + model_case - if 'llava' in model_case: - backend_config = TurbomindEngineConfig(tp=tp, - session_len=8192, - model_name='vicuna') + if 'pytorch' in backend: + backend_config = PytorchEngineConfig(tp=tp, session_len=8192) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp, session_len=8192) + + if 'llava' in model_case: + backend_config.model_name = 'vicuna' if '4bit' in model_case.lower() or 'awq' in model_case.lower(): backend_config.model_format = 'awq' if quant_policy is not None: @@ -301,7 +306,8 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( - log_path, 'pipeline_vl_chat_' + model_case.split('/')[1] + '.log') + log_path, + 'pipeline_vl_chat_' + model_case.split('/')[1] + worker_id + '.log') file = open(pipeline_chat_log, 'w') image = load_image(PIC1) @@ -311,7 +317,16 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): else: prompt = 'describe this image' - file.writelines('engineconfig:' + str(backend_config)) + log_string = '\n'.join([ + 'reproduce config info:', 'from lmdeploy import pipeline', + 'from lmdeploy.messages import PytorchEngineConfig', + 'from lmdeploy.messages import TurbomindEngineConfig', + 'engine_config = ' + str(backend_config), + 'pipe = pipeline("' + hf_path + '", backend_config=engine_config)', + f'res = pipe(({prompt}, {image}))' + ]) + file.writelines(log_string) + print(log_string) response = pipe((prompt, 
image)) result = 'tiger' in response.text.lower() or '虎' in response.text.lower() file.writelines('result:' + str(result) + @@ -377,11 +392,12 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): torch.cuda.empty_cache() -def assert_pipeline_vl_chat_log(config, model_case): +def assert_pipeline_vl_chat_log(config, model_case, worker_id): log_path = config.get('log_path') pipeline_chat_log = os.path.join( - log_path, 'pipeline_vl_chat_' + model_case.split('/')[1] + '.log') + log_path, + 'pipeline_vl_chat_' + model_case.split('/')[1] + worker_id + '.log') allure.attach.file(pipeline_chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index dfc363b086..77af1975be 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -6,6 +6,7 @@ import allure import psutil +from openai import OpenAI from pytest_assume.plugin import assume from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid from utils.get_run_config import get_command_with_extra @@ -278,3 +279,52 @@ def get_model(url): return model_name.split('/')[-1] except Exception: return None + + +PIC = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 + + +def run_vl_testcase(config, port: int = DEFAULT_PORT): + http_url = BASE_HTTP_URL + ':' + str(port) + log_path = config.get('log_path') + + client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') + model_name = client.models.list().data[0].id + + restful_log = os.path.join( + log_path, + 'restful_vl_' + model_name.split('/')[-1] + str(port) + '.log') + file = open(restful_log, 'w') + + prompt_messages = [{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': PIC, + }, + }], + }] + + response = client.chat.completions.create(model=model_name, + messages=prompt_messages, + temperature=0.8, + top_p=0.8) + file.writelines(str(response).lower() + '\n') + assert 'tiger' in str(response).lower() or '虎' in str( + response).lower(), response + + api_client = APIClient(http_url) + model_name = api_client.available_models[0] + for item in api_client.chat_completions_v1(model=model_name, + messages=prompt_messages): + continue + file.writelines(str(item) + '\n') + assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item + + allure.attach.file(restful_log, + attachment_type=allure.attachment_type.TEXT) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 684a4f5109..283ce596f6 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -4,39 +4,39 @@ The following tables detail the models supported by LMDeploy's TurboMind engine ## TurboMind on CUDA Platform -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | 
Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | -| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | -| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | -| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :------------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | "-" means not verified yet. 
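For readers who want to reproduce the refactored VL check outside the CI harness, the sketch below condenses what the backend-parameterized `run_pipeline_vl_chat_test` above boils down to: pick `PytorchEngineConfig` or `TurbomindEngineConfig` and push a single image prompt through `lmdeploy.pipeline`. The model path, session length, and image URL are illustrative assumptions, not values fixed by this patch.

```python
# Minimal sketch of the backend-parameterized VL smoke test; not the test suite itself.
from lmdeploy import pipeline
from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Same sample image the autotest helpers use.
TIGER_PIC = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'


def vl_smoke_test(model_path: str, backend: str = 'turbomind', tp: int = 1) -> bool:
    """Run one image prompt through the chosen backend and check the answer."""
    if backend == 'pytorch':
        engine_config = PytorchEngineConfig(tp=tp, session_len=8192)
    else:
        engine_config = TurbomindEngineConfig(tp=tp, session_len=8192)
    pipe = pipeline(model_path, backend_config=engine_config)
    image = load_image(TIGER_PIC)
    response = pipe(('describe this image', image))
    # The CI helper accepts either the English or the Chinese word for tiger.
    return 'tiger' in response.text.lower() or '虎' in response.text.lower()


if __name__ == '__main__':
    # The model path below is a placeholder chosen for illustration.
    print(vl_smoke_test('OpenGVLab/InternVL2-2B', backend='pytorch'))
```

The restful variant in `run_vl_testcase` follows the same idea, but drives an already-running api_server through the OpenAI-compatible `/v1/chat/completions` route instead of constructing a pipeline in-process.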
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index d8bf9a1ad8..908f9a17f5 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -4,39 +4,39 @@ ## TurboMind CUDA 平台 -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | -| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | -| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | -| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :------------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 
| 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | “-” 表示还没有验证。 From 324237b2c9e223c2392088cecb57b3703d1f7d54 Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:01:12 +0800 Subject: [PATCH 19/40] [Feature] support minicpm-v_2_6 for pytorch engine. (#2767) * support minicpmv_2_6. * update supported_models. * update supported_models. --- docs/en/supported_models/supported_models.md | 1 + .../supported_models/supported_models.md | 1 + lmdeploy/pytorch/models/minicpmv26.py | 430 ++++++++++++++++++ lmdeploy/pytorch/models/module_map.py | 6 + lmdeploy/pytorch/supported_models.py | 2 + 5 files changed, 440 insertions(+) create mode 100644 lmdeploy/pytorch/models/minicpmv26.py diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 283ce596f6..da52241253 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -72,6 +72,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 908f9a17f5..502e91b6d3 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -72,6 +72,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | diff --git a/lmdeploy/pytorch/models/minicpmv26.py b/lmdeploy/pytorch/models/minicpmv26.py new file mode 100644 index 0000000000..725e97d9d7 --- /dev/null +++ b/lmdeploy/pytorch/models/minicpmv26.py @@ -0,0 +1,430 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers.configuration_utils import PretrainedConfig + +from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, + SiluAndMul, build_rotary_embedding, + build_rotary_params) +from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, + build_qkv_proj, build_rowwise_linear) +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight + +from .utils.cudagraph import CudaGraphMixin + + +class MiniCPMV26Attention(nn.Module): + """Rewrite module of MiniCPMV26Attention.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + num_heads = config.num_attention_heads + num_key_value_heads = config.num_key_value_heads + hidden_size = config.hidden_size + head_dim = getattr(config, 'head_dim', hidden_size // num_heads) + + # packed qkv + self.qkv_proj = build_qkv_proj( + hidden_size, + num_q_heads=num_heads, + num_kv_heads=num_key_value_heads, + head_size=head_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # rotary embedding + self.apply_rotary_pos_emb = ApplyRotaryEmb() + + # attention + self.attn_fwd = Attention( + num_heads, + head_dim, + num_kv_heads=num_key_value_heads, + v_head_size=head_dim, + sliding_window=config.sliding_window, + ) + + # o_proj + self.o_proj = build_rowwise_linear(num_heads * head_dim, + hidden_size, + bias=False, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attn_metadata: Any = None, + ): + """Rewrite of LlamaAttention.forward.""" + # qkv proj + qkv_states = self.qkv_proj(hidden_states) + # (-1, heads, head_dim) + qkv_states = qkv_states.flatten(0, -2) + query_states, key_states, value_states = self.qkv_proj.split_qkv( + qkv_states) + + # apply rotary embedding + cos, sin = rotary_pos_emb + query_states, key_states = self.apply_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + inplace=True, + ) + + # attention + attn_output = self.attn_fwd( + query_states, + key_states, + value_states, + past_key_value[0], + past_key_value[1], + attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], + inplace=True, + ) + attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) + + # o proj + attn_output = self.o_proj(attn_output) + return attn_output + + +class MiniCPMV26MLP(nn.Module): + """mlp.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + # gate up + self.gate_up_proj = build_merged_colwise_linear( + config.hidden_size, + [config.intermediate_size, config.intermediate_size], + bias=False, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + # silu and mul + self.act_fn = SiluAndMul(inplace=True) + + # down + self.down_proj = build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=False, + quant_config=quantization_config, + 
dtype=dtype, + device=device, + is_tp=True) + + def forward(self, x): + """forward.""" + gate_up = self.gate_up_proj(x) + act = self.act_fn(gate_up) + return self.down_proj(act) + + +class MiniCPMV26DecoderLayer(nn.Module): + """decoder layer.""" + + def __init__(self, + config: PretrainedConfig, + layer_idx: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layer_idx = layer_idx + quantization_config = getattr(config, 'quantization_config', None) + + # build attention layer + self.self_attn = MiniCPMV26Attention(config, + dtype=dtype, + device=device) + + # build MLP + self.mlp = MiniCPMV26MLP(config, dtype=dtype, device=device) + + # build input layer norm + self.input_layernorm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build attention layer norm + self.post_attention_layernorm = RMSNorm( + config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[List[torch.FloatTensor]], + residual: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + ): + + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + # Self Attention + hidden_states = self.self_attn( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + outputs = (hidden_states, residual) + return outputs + + +class MiniCPMV26Model(nn.Module): + """model.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + quantization_config = getattr(config, 'quantization_config', None) + + self.embed_tokens = nn.Embedding(config.vocab_size, + config.hidden_size, + self.padding_idx, + dtype=dtype, + device=device) + + # build all decode layers + self.layers = nn.ModuleList([ + MiniCPMV26DecoderLayer(config, + layer_idx, + dtype=dtype, + device=device) + for layer_idx in range(config.num_hidden_layers) + ]) + + # build norm + self.norm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build rotary embedding + rope_params = build_rotary_params(config) + rope_dim = config.hidden_size // config.num_attention_heads + rope_max_pos_emb = config.max_position_embeddings + rope_base = config.rope_theta + self.rotary_emb = build_rotary_embedding( + rope_dim, + rope_max_pos_emb, + rope_base, + **rope_params, + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + attn_metadata: Any = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ): + """Rewrite of LlamaModel.forward.""" + + # token embedding + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + # rotary embedding + cos, sin = self.rotary_emb(hidden_states, position_ids) + cos, sin = cos[0], 
sin[0] + rotary_pos_emb = (cos, sin) + + # decoding + residual = None + for idx, decoder_layer in enumerate(self.layers): + past_key_value = past_key_values[idx] + hidden_states, residual = decoder_layer( + hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + residual=residual, + attn_metadata=attn_metadata, + ) + + # norm + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + def get_input_embeddings(self): + """get input embeddings.""" + return self.embed_tokens + + +class MiniCPMVForCausalLM(nn.Module, CudaGraphMixin): + """rewrote model of MiniCPMVForCausalLM.""" + + packed_modules_mapping = { + 'gate_up_proj': [ + 'gate_proj', + 'up_proj', + ], + } + + def __init__(self, + config: PretrainedConfig, + ctx_mgr: StepContextManager, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.ctx_mgr = ctx_mgr + # build model + self.model = MiniCPMV26Model(config, dtype=dtype, device=device) + # build lm_head + self.lm_head = build_rowwise_linear(config.hidden_size, + config.vocab_size, + bias=False, + dtype=dtype, + device=device) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + attn_metadata: Any = None, + inputs_embeds: torch.Tensor = None, + **kwargs, + ): + """model forward, return logits.""" + hidden_states = self.model( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return self.lm_head(hidden_states) + + def update_weights(self): + """update weights.""" + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + def get_input_embeddings(self): + """get input embeddings.""" + return self.model.get_input_embeddings() + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None, + ): + """prepare input.""" + # get input_ids, position_ids and attention metadatas + input_ids = context.input_ids + position_ids = context.position_ids + attn_metadata = context.attn_metadata + + # process vision embeddings + vision_embeddings = context.input_embeddings + vision_embedding_indexing = context.input_embedding_indexing + if vision_embeddings is not None and len(vision_embeddings) > 0: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds[:, + vision_embedding_indexing, :] = vision_embeddings.to( + inputs_embeds) + + # inputs of forward + return dict( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """load weights.""" + # modify from vllm + stacked_params_mapping = [ + ('.qkv_proj', '.q_proj', 'q'), + ('.qkv_proj', '.k_proj', 'k'), + ('.qkv_proj', '.v_proj', 'v'), + ('.gate_up_proj', '.gate_proj', 0), + ('.gate_up_proj', '.up_proj', 1), + ] + + params_dict = dict(self.named_parameters(prefix='llm')) + for name, loaded_weight in weights: + if 'vpm' in name or 'resampler' in name: + continue + if 'rotary_emb.inv_freq' in name: + continue + if ('rotary_emb.cos_cached' in name + or 'rotary_emb.sin_cached' in name): + 
continue + if self.config.tie_word_embeddings and 'lm_head.weight' in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + load_weight(param, loaded_weight, shard_id=shard_id) + break + else: + param = params_dict[name] + load_weight(param, loaded_weight) diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index e6b5f6e29e..1059bfee4e 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -173,6 +173,12 @@ f'{LMDEPLOY_PYTORCH_MODEL_PATH}.minicpm3.MiniCPM3ForCausalLM', }) +# minicpmv2_6 +MODULE_MAP.update({ + 'MiniCPMV': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.minicpmv26.MiniCPMVForCausalLM', +}) + # mllama MODULE_MAP.update({ 'MllamaForConditionalGeneration': diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py index 21418188dd..7fa568651b 100644 --- a/lmdeploy/pytorch/supported_models.py +++ b/lmdeploy/pytorch/supported_models.py @@ -70,6 +70,8 @@ PhiMoEForCausalLM=True, # mllama MllamaForConditionalGeneration=True, + # MiniCPM-V-2_6 + MiniCPMVForCausalLM=True, ) From b4834ea4c6d9add7253092ff3271f0add86bab44 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:21:12 +0800 Subject: [PATCH 20/40] Support qwen2-vl AWQ quantization (#2787) * Support qwen2-vl AWQ quantization * Update config.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 4 +-- lmdeploy/lite/apis/calibrate.py | 3 ++ lmdeploy/lite/quantization/awq.py | 9 ++++++ lmdeploy/lite/utils/batch_split.py | 8 +++++ lmdeploy/vl/model/qwen2.py | 52 ++++++++++++++++-------------- 5 files changed, 49 insertions(+), 27 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 6c92d2cf0b..e31a40f0d4 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -163,8 +163,6 @@ pytorch_base_model: turbomind_quatization: no_awq: - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf @@ -189,6 +187,8 @@ pytorch_quatization: - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-1.5B-Instruct - microsoft/Phi-3-mini-4k-instruct + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct w8a8: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-2-7b-chat-hf diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index cd5178793d..0780e93594 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -26,6 +26,7 @@ 'Phi3ForCausalLM': 'Phi3DecoderLayer', 'ChatGLMForConditionalGeneration': 'GLMBlock', 'MixtralForCausalLM': 'MixtralDecoderLayer', + 'Qwen2VLForConditionalGeneration': 'Qwen2VLDecoderLayer', } NORM_TYPE_MAP = { @@ -42,6 +43,7 @@ 'Phi3ForCausalLM': 'Phi3RMSNorm', 'ChatGLMForConditionalGeneration': 'RMSNorm', 'MixtralForCausalLM': 'MixtralRMSNorm', + 'Qwen2VLForConditionalGeneration': 'Qwen2RMSNorm', } HEAD_NAME_MAP = { @@ -58,6 +60,7 @@ 'Phi3ForCausalLM': 'lm_head', 'ChatGLMForConditionalGeneration': 'output_layer', 'MixtralForCausalLM': 'lm_head', + 'Qwen2VLForConditionalGeneration': 'lm_head', } diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index 068ad9357e..2efe41b6da 100644 --- a/lmdeploy/lite/quantization/awq.py +++ 
b/lmdeploy/lite/quantization/awq.py @@ -45,6 +45,11 @@ ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], 'post_attention_layernorm': ['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3'] + }, + 'Qwen2VLDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] } } @@ -83,6 +88,10 @@ 'MixtralDecoderLayer': { 'self_attn.v_proj': ['self_attn.o_proj'], 'block_sparse_moe.experts.{i}.w3': ['block_sparse_moe.experts.{i}.w2'] + }, + 'Qwen2VLDecoderLayer': { + 'self_attn.v_proj': ['self_attn.o_proj'], + 'mlp.up_proj': ['mlp.down_proj'] } } diff --git a/lmdeploy/lite/utils/batch_split.py b/lmdeploy/lite/utils/batch_split.py index 3bd208f609..4e30f61d34 100644 --- a/lmdeploy/lite/utils/batch_split.py +++ b/lmdeploy/lite/utils/batch_split.py @@ -46,6 +46,14 @@ def split_decoder_layer_inputs( for name, val in kwargs.items(): if isinstance(val, torch.Tensor) and val.size(0) == bs: new_kwargs[name] = val[i:i + batch_size] + elif isinstance(val, torch.Tensor) and len( + val.shape) > 1 and val.size(1) == bs: # qwen2-vl + new_kwargs[name] = val[:, i:i + batch_size] + elif name == 'position_embeddings' and isinstance( + val, Tuple) and len( + val[0].shape) > 1 and val[0].size(1) == bs: # qwen2-vl + new_kwargs[name] = (val[0][:, i:i + batch_size], + val[1][:, i:i + batch_size]) else: new_kwargs[name] = val diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py index 2e53d8e0f0..3eb3c1541c 100644 --- a/lmdeploy/vl/model/qwen2.py +++ b/lmdeploy/vl/model/qwen2.py @@ -33,33 +33,35 @@ class Qwen2VLModel(VisonModel): def build_model(self): check_qwen_vl_deps_install() - - from accelerate import init_empty_weights - with init_empty_weights(): - config = self.hf_config - config.quantization_config = {} # disable vision part quantization - # disable accelerate check_tied_parameters_in_config - # for Qwen2-VL-2B-Instruct - config.tie_word_embeddings = False - - from transformers import Qwen2VLForConditionalGeneration - model = Qwen2VLForConditionalGeneration._from_config(config) - if not self.with_llm: + from transformers import Qwen2VLForConditionalGeneration + if self.with_llm: + model = Qwen2VLForConditionalGeneration.from_pretrained( + self.hf_config._name_or_path, trust_remote_code=True) + model.half() + self.vl_model = model + else: + from accelerate import init_empty_weights + with init_empty_weights(): + config = self.hf_config + config.quantization_config = { + } # disable vision part quantization + # disable accelerate check_tied_parameters_in_config + # for Qwen2-VL-2B-Instruct + config.tie_word_embeddings = False + + model = Qwen2VLForConditionalGeneration._from_config(config) del model.model del model.lm_head - else: - self.vl_model = model - model.half() - - from accelerate import load_checkpoint_and_dispatch - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - max_memory=self.max_memory, - no_split_module_classes=['Qwen2VLVisionBlock'], - dtype=torch.half) + model.half() + from accelerate import load_checkpoint_and_dispatch + with disable_logging(): + load_checkpoint_and_dispatch( + model=model, + checkpoint=self.model_path, + device_map='auto' if not self.with_llm else {'': 'cpu'}, + max_memory=self.max_memory, + no_split_module_classes=['Qwen2VLVisionBlock'], + dtype=torch.half) self.model = model.eval() From 
f13c0f93e82873b18dbc3daabb2f89ed16b4ea21 Mon Sep 17 00:00:00 2001 From: Wei Tao <1136862851@qq.com> Date: Mon, 25 Nov 2024 18:33:24 +0800 Subject: [PATCH 21/40] [dlinfer] Fix qwenvl rope error for dlinfer backend (#2795) --- lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py index fab6e510f5..ed807d66b0 100644 --- a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py @@ -24,7 +24,8 @@ def _rotary_embedding_fwd(position_ids: torch.Tensor, else: position_ids = position_ids.float() - inv_freq_expanded = inv_freq.view(1, -1, 1) + inv_freq_expanded = inv_freq.view(1, -1, 1).expand(position_ids.size(0), + -1, 1) position_ids_expanded = position_ids.unsqueeze(1) tmp = torch.bmm(inv_freq_expanded, position_ids_expanded) From b5b31791a76cf37ac3856d0394fa3eb5502217a6 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Mon, 25 Nov 2024 20:30:58 +0800 Subject: [PATCH 22/40] Optimize update_step_ctx on Ascend (#2804) * opt update_ctx for ascend * fix lint --- .../backends/dlinfer/ascend/op_backend.py | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 79e5288364..b6f544510b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -71,31 +71,42 @@ def get_total_slots(): max_q_seq_len = max(q_seqlens_list) max_kv_seq_len = max(kv_seqlens_list) - for i in range(step_context.q_start_loc.size(0)): - q_seq_len = q_seqlens_list[i] - kv_seq_len = kv_seqlens_list[i] - - # collect kv start indices. - history_length = kv_seq_len - q_seq_len - total_slots = get_total_slots() - slot_tables = total_slots[step_context.block_offsets[i]].view(-1) - slots = slot_tables[history_length:kv_seq_len] - kv_start_indices.append(slots) - - # collect attention mask of paged_prefill attention stage. - if not (step_context.is_decoding or is_unpaged_prefill): - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(q_seq_len, - step_context.block_offsets.shape[1] * - block_size, - dtype=torch.bool, - device=step_context.block_offsets.device), - diagonal=kv_seq_len - q_seq_len, - )) - attention_mask.append(single_attention_mask) - - kv_start_indices = torch.cat(kv_start_indices) + if step_context.is_decoding: + # collect kv_start_indices without using a for-loop, + # (fill kv-cache for just ONE token during the decoding phase) + idx = (step_context.kv_seqlens - 1) % block_size + block_num = (step_context.kv_seqlens - 1) // block_size + last_block = step_context.block_offsets.gather( + 1, block_num.view(-1, 1)).view(-1) + kv_start_indices = last_block * block_size + idx + else: + for i in range(step_context.q_start_loc.size(0)): + q_seq_len = q_seqlens_list[i] + kv_seq_len = kv_seqlens_list[i] + + # collect kv start indices during the prefill phase. + history_length = kv_seq_len - q_seq_len + total_slots = get_total_slots() + slot_tables = total_slots[step_context.block_offsets[i]].view( + -1) + slots = slot_tables[history_length:kv_seq_len] + kv_start_indices.append(slots) + + # collect attention mask of paged_prefill attention stage. 
+ if not is_unpaged_prefill: + single_attention_mask = torch.logical_not( + torch.tril( + torch.ones( + q_seq_len, + step_context.block_offsets.shape[1] * + block_size, + dtype=torch.bool, + device=step_context.block_offsets.device), + diagonal=kv_seq_len - q_seq_len, + )) + attention_mask.append(single_attention_mask) + + kv_start_indices = torch.cat(kv_start_indices) if step_context.is_decoding: # prepare some params of paged_decode attention stage. From 3913eadfbb60c2d73832f25415d5ebaef62280a4 Mon Sep 17 00:00:00 2001 From: q yao Date: Wed, 27 Nov 2024 19:17:16 +0800 Subject: [PATCH 23/40] disable prefix-caching for vl model (#2825) --- lmdeploy/api.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index e66d73754a..2b4204a53b 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -69,7 +69,11 @@ def pipeline(model_path: str, if backend_config is not None else None model_path = get_model(model_path, download_dir, revision) - _, pipeline_class = get_task(model_path) + task, pipeline_class = get_task(model_path) + if task == 'vlm': + if backend_config.enable_prefix_caching: + backend_config.enable_prefix_caching = False + logger.warning('VLM does not support prefix caching.') if type(backend_config) is not PytorchEngineConfig: # set auto backend mode From f88fbc3c31961b1cb159e041dcb657592fb2da21 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 29 Nov 2024 10:37:42 +0800 Subject: [PATCH 24/40] Add DeepSeek-V2 support (#2763) * add qwen2-moe * eliminate `inter_size_` from ffn layer * clean up * fix lint * clean up * layer-wise `inter_size` & `expert_num` * add head dim 192 * refactor weight processing * deepseek-v2-lite * deepseek-v2 * fix lint * fix lint * fix ut * Update config.yaml * Update config.yaml * fix mixtral * fix moe gating & config parsing * fix yarn for deepseek-v2 * fix `copy_from` * fix rms norm, rotary embedding & deepseek v2 attention * remove debug code --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 2 + examples/cpp/llama/llama_triton_example.cc | 4 +- lmdeploy/turbomind/deploy/config.py | 23 +- lmdeploy/turbomind/deploy/converter.py | 7 +- lmdeploy/turbomind/deploy/loader.py | 21 ++ lmdeploy/turbomind/deploy/module.py | 82 ++++- .../turbomind/deploy/source_model/__init__.py | 1 + .../deploy/source_model/deepseek2.py | 134 ++++++++ .../turbomind/deploy/source_model/mixtral.py | 2 +- .../turbomind/deploy/source_model/qwen.py | 2 +- .../turbomind/deploy/target_model/base.py | 20 +- lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/CMakeLists.txt | 1 + .../kernels/attention/CMakeLists.txt | 2 + src/turbomind/kernels/attention/attention.cu | 6 + .../attention/codegen/attention_sm80_192.cu | 16 + .../attention/codegen/decoding_sm80_192.cu | 20 ++ src/turbomind/kernels/attention/decoding.cu | 17 +- .../kernels/attention/decoding_config.h | 12 +- src/turbomind/kernels/attention/impl_16816.h | 61 ++-- src/turbomind/kernels/attention/impl_81616.h | 2 +- src/turbomind/kernels/attention/impl_simt.h | 14 +- .../kernels/attention/kv_cache_utils_v2.cu | 12 +- .../kernels/attention/mainloop_sm80.h | 17 +- src/turbomind/kernels/attention/reduce.cu | 6 +- .../kernels/attention/reduce_kernel.h | 7 +- .../kernels/attention/rotary_embedding.h | 17 + .../kernels/attention/test_attention.cu | 12 +- src/turbomind/kernels/core/array_ops.h | 2 +- src/turbomind/kernels/core/math.h | 8 + src/turbomind/kernels/core/thread_map.h | 3 +- 
.../flash_attention2/CMakeLists.txt | 4 +- .../flash_fwd_launch_template.h | 2 +- .../flash_attention2/static_switch.h | 12 + src/turbomind/kernels/gemm/context.h | 13 +- src/turbomind/kernels/gemm/convert_v2.cu | 41 ++- src/turbomind/kernels/gemm/moe_utils_v2.cu | 195 +++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 4 + .../kernels/gemm/test/test_moe_utils.cu | 86 +---- src/turbomind/kernels/gemm/test/testbed.h | 4 +- src/turbomind/kernels/gemm/unpack.cu | 34 +- src/turbomind/kernels/norm/CMakeLists.txt | 5 + src/turbomind/kernels/norm/rms_norm.cu | 235 +++++++++++++ src/turbomind/kernels/norm/rms_norm.h | 21 ++ src/turbomind/models/llama/CMakeLists.txt | 4 +- src/turbomind/models/llama/LlamaBatch.cc | 6 +- .../models/llama/LlamaDecoderLayerWeight.cc | 325 ++++++++---------- .../models/llama/LlamaDecoderLayerWeight.h | 39 +-- src/turbomind/models/llama/LlamaDenseWeight.h | 265 +++++++++----- src/turbomind/models/llama/LlamaFfnLayer.cc | 26 +- src/turbomind/models/llama/LlamaFfnLayer.h | 9 +- src/turbomind/models/llama/LlamaV2.cc | 1 - src/turbomind/models/llama/LlamaV2.h | 1 - src/turbomind/models/llama/LlamaWeight.cc | 99 +++--- src/turbomind/models/llama/LlamaWeight.h | 36 +- src/turbomind/models/llama/llama_gemm.cc | 2 +- src/turbomind/models/llama/llama_kernels.h | 2 +- src/turbomind/models/llama/llama_params.h | 65 +++- src/turbomind/models/llama/llama_utils.cu | 73 ++-- src/turbomind/models/llama/mla_utils.cu | 93 +++++ src/turbomind/models/llama/mla_utils.h | 57 +++ src/turbomind/models/llama/moe_ffn_layer.cc | 74 ++-- src/turbomind/models/llama/moe_ffn_layer.h | 20 +- .../models/llama/unified_attention_layer.cc | 150 ++++++-- .../models/llama/unified_attention_layer.h | 7 +- src/turbomind/models/llama/unified_decoder.cc | 89 ++--- src/turbomind/models/llama/unified_decoder.h | 16 +- src/turbomind/models/llama/weight_type.h | 56 +++ src/turbomind/python/bind.cpp | 48 ++- .../triton_backend/llama/LlamaTritonModel.cc | 80 +++-- .../triton_backend/llama/LlamaTritonModel.h | 3 - src/turbomind/utils/allocator.h | 3 +- src/turbomind/utils/cuda_utils.h | 19 + src/turbomind/utils/memory_utils.cu | 108 +++--- src/turbomind/utils/memory_utils.h | 13 +- 75 files changed, 2118 insertions(+), 861 deletions(-) create mode 100644 lmdeploy/turbomind/deploy/source_model/deepseek2.py create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm80_192.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu create mode 100644 src/turbomind/kernels/norm/CMakeLists.txt create mode 100644 src/turbomind/kernels/norm/rms_norm.cu create mode 100644 src/turbomind/kernels/norm/rms_norm.h create mode 100644 src/turbomind/models/llama/mla_utils.cu create mode 100644 src/turbomind/models/llama/mla_utils.h create mode 100644 src/turbomind/models/llama/weight_type.h diff --git a/autotest/config.yaml b/autotest/config.yaml index e31a40f0d4..88ca7c3127 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -62,6 +62,7 @@ turbomind_chat_model: - liuhaotian/llava-v1.6-vicuna-7b - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - openbmb/MiniCPM-Llama3-V-2_5 @@ -165,6 +166,7 @@ turbomind_quatization: no_awq: - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat diff --git 
a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b0e513410e..1fb5fa0964 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -114,14 +114,14 @@ broadCastRequest(const std::vector& v_start_ids, } else { // conditional case. - ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_ids, size_1, nullptr, false); // ft::deviceMalloc(&d_input_lengths, size_2, false); ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); } if (!v_input_bad_words.empty()) { - ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::deviceMalloc(&d_input_bad_words, size_bad_words, nullptr, false); ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); } else { diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index c724b085a0..e483500e96 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,6 +2,7 @@ import inspect import json from dataclasses import asdict, fields +from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -43,22 +44,33 @@ class ModelConfig: # of token_embedding embedding_size: int = 0 num_layer: int = None - inter_size: int = None + inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 start_id: int = None end_id: int = None size_per_head: int = 128 - group_size: int = 0 + group_size: int = 64 weight_type: str = None session_len: int = None tp: int = 1 model_format: str = 'hf' - expert_num: int = 0 + expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 - moe_shared_gate: int = False - moe_norm_topk: int = False + moe_shared_gate: bool = False + norm_topk_prob: bool = False + routed_scale: float = 1.0 + topk_group: int = 1 + topk_method: str = 'greedy' + moe_group_num: int = 1 + # MLA + q_lora_rank: int = 0 + kv_lora_rank: int = 0 + qk_rope_dim: int = 0 + v_head_dim: int = 0 + # tuning + tune_layer_num: int = 1 def verify(self): invalid = {} @@ -72,6 +84,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + softmax_scale: float = 0 attention_factor: float = None max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 1c847ede01..77f0bc8dc8 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -241,11 +241,10 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) - if not group_size: - group_size = 128 - if engine_config.model_format in ['awq', 'gptq']: + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ f'but group_size is {group_size}. 
Currently, only 128 ' \ diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index e3d79b164a..94e779b6b7 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -88,6 +88,27 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params + # def items(self): + # params = defaultdict(dict) + # for shard in self.shards: + # # with safe_open(shard, 'pt') as f: + # with open(shard, 'rb') as f: + # w = safetensors.torch.load(f.read()) + # misc = [] + # for k in w.keys(): + # match = re.findall(self.pattern, k) + # if not match: + # misc.append(k) + # else: + # idx = int(match[0]) + # param = params[idx] + # param[k] = w[k] + # if len(param) == self.item_count[idx]: + # yield (idx, params.pop(idx)) + # if misc: + # yield (-1, {k: w[k] for k in misc}) + # assert not params + class PytorchLoader(BaseLoader): diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8d998abe2b..52497175ef 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,10 +96,13 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size + # inter_sizes in config are padded and may be different from what's + # in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) def _export(self, + inter_size: int, fmt: str, idx: int, w123, @@ -110,11 +113,11 @@ def _export(self, w1, w2, w3 = map(transpose, w123) if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) + w1 = pad_out_dims(w1, inter_size) + w3 = pad_out_dims(w3, inter_size) if not is_lora_b: group_size = self.group_size if apply_gs else 1 - w2 = pad_in_dims(w2, self.inter_size // group_size) + w2 = pad_in_dims(w2, inter_size // group_size) w1, w2, w3 = map(pack_fn, (w1, w2, w3)) self.model.save_split(w1, @@ -132,7 +135,8 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), + partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -154,11 +158,13 @@ def __init__(self, model: BaseOutputModel): self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): + if self.expert_num[i] == 0: + return for p in get_params(r.moe_ffn_expert()): - for e in range(self.expert_num): + for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), - i) + p(partial(self._export, self.inter_size, fmt), + partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) self.model.save_split(gate, self._moe_ffn_gate.format(i)) @@ -218,6 +224,62 @@ def apply(self, i: int, r: BaseReader): e(self._export, partial(r.attn, i), i) +class MLA(Module): + """ + requires: + r.mla(i, kind) + r.mla_norm(i) + """ + + _mla = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + + def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): + if all(x is None for x in xs): + return + q_a, q_b, q, kv_a, kv_b, o = map(transpose, xs) + + if q is not None: + q_b = q + + cfg = self.model.model_config + + o = o.reshape(cfg.head_num, cfg.v_head_dim, -1) + o = torch.nn.functional.pad( + o, (0, 0, 0, cfg.size_per_head - cfg.v_head_dim, 0, 0)) + o = o.view(cfg.head_num * 
cfg.size_per_head, cfg.hidden_units) + + if q_a is not None: + self.model.save_split(pack_fn(q_a), + self._mla.format(idx, 'q_a_proj', kind)) + q_b_name = 'q_proj' if q_a is None else 'q_b_proj' + self.model.save_split(pack_fn(q_b), + self._mla.format(idx, q_b_name, kind), + split_dim=-1) + self.model.save_split(pack_fn(kv_a), + self._mla.format(idx, 'kv_a_proj', kind)) + self.model.save_split(pack_fn(kv_b), + self._mla.format(idx, 'kv_b_proj', kind), + split_dim=-1) + self.model.save_split(pack_fn(o), + self._mla.format(idx, 'wo', kind), + split_dim=0) + + _layernorm = 'layers.{0}.attention.{1}_a_layernorm' + + def apply(self, i: int, r: BaseReader): + + for f in get_params(r.attn(i, None), bias=False): + f(self._export, partial(r.mla, i), i) + + q, k = r.mla_norm(i) + if q is not None: + self.model.save_split(q, self._layernorm.format(i, 'q')) + self.model.save_split(k, self._layernorm.format(i, 'kv')) + + class Misc(Module): """ requires: @@ -258,7 +320,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - modules = [Attn, LayerNorm] + modules = [LayerNorm] + if model.model_config.kv_lora_rank: + modules.append(MLA) + else: + modules.append(Attn) if model.model_config.inter_size: modules.append(Ffn) if model.model_config.expert_num: diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index de16bdc0a0..b9394b1244 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .deepseek2 import DeepSeek2Model # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 from .internlm2 import InternLM2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py new file mode 100644 index 0000000000..0023f650ff --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import math + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class DeepSeek2Reader(LlamaReader): + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(r'mlp' if i == 0 else r'shared_expert\.') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' + if i == 0: + name = name.replace('shared_experts.', '') + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla(self, i: int, kind: str): + if not kind: + return self.filter(r'self_attn.*proj') + result = [] + for key in [ + 'q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', + 'kv_b_proj', 'o_proj' + ]: + tensor = self.params.get( + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla_norm(self, i: int): + result = [] + for k in ['q', 'kv']: + name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 + result.append(self.params.get(name)) + return (*result, ) + + +def get_yarn_params(rope_scaling: dict): + + scaling_factor = float(rope_scaling['factor']) + mscale = rope_scaling['mscale'] + mscale_all_dim = rope_scaling['mscale_all_dim'] + + def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + _mscale = float( + yarn_get_mscale(scaling_factor, mscale) / + yarn_get_mscale(scaling_factor, mscale_all_dim)) + + softmax_scale = 0 + if mscale_all_dim: + scale = yarn_get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = scale * scale + + return _mscale, softmax_scale + + +@INPUT_MODELS.register_module(name='deepseek2') +class DeepSeek2Model(LlamaModel): + + Reader = DeepSeek2Reader + + def tokenizer_info(self): + n_words = self.model_config['vocab_size'] + bos_id = self.model_config['bos_token_id'] + eos_id = self.model_config['eos_token_id'] + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + qk_nope_dim = cfg['qk_nope_head_dim'] + qk_rope_dim = cfg['qk_rope_head_dim'] + num_layer = cfg['num_hidden_layers'] + expert_num = cfg['n_routed_experts'] + expert_num = [expert_num] * num_layer + expert_num[0] = 0 + n_shared_experts = cfg['n_shared_experts'] + expert_inter_size = cfg['moe_intermediate_size'] + experts_per_token = cfg['num_experts_per_tok'] + inter_size = [n_shared_experts * expert_inter_size] * num_layer + inter_size[0] = cfg['intermediate_size'] + norm_topk_prob = cfg['norm_topk_prob'] + size_per_head = qk_rope_dim + qk_nope_dim + info.update(kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=size_per_head, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + 
routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], + tune_layer_num=2) + rope_scaling = cfg.get('rope_scaling') + if rope_scaling and rope_scaling['type'] == 'yarn': + attention_factor, softmax_scale = get_yarn_params(rope_scaling) + softmax_scale *= size_per_head**(-0.5) + info.update(max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=attention_factor, + softmax_scale=softmax_scale) + return info diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index ff9df2d409..6ac22a658e 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,6 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] - info['moe_norm_topk'] = True + info['norm_topk_prob'] = True info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 772bd03037..637983e8ce 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -178,6 +178,6 @@ def model_info(self): info['experts_per_token'] = cfg['num_experts_per_tok'] info['inter_size'] = cfg['shared_expert_intermediate_size'] info['moe_shared_gate'] = True - info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['norm_topk_prob'] = cfg['norm_topk_prob'] info['attn_bias'] = 1 return info diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 09699ade09..f2c981bb24 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from abc import ABC +from collections.abc import Sequence import torch import tqdm @@ -65,13 +66,14 @@ def __init__(self, # get `model_info` and `tokenizer_info` at first, which # will be updated to `self.model_config` and `self.attention_config` self.input_model_info = self.input_model.model_info() + self.input_model_info = self.single_to_list( + self.input_model_info, keys=['inter_size', 'expert_num']) self.input_model_tokenizer_info = self.input_model.tokenizer_info() self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - self.model_config.inter_size = _pad_inter_size( - self.model_config.inter_size, self.model_config.group_size, - self.tensor_para_size) + for i, v in enumerate(self.model_config.inter_size): + self.model_config.inter_size[i] = _pad_inter_size( + v, self.model_config.group_size, self.tensor_para_size) if self.model_config.expert_num: self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, @@ -79,11 +81,21 @@ def __init__(self, self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 + # print(self.model_config) + self.update_attention_config() self.update_lora_config() # ! 
Dependency on `self` self.model = model_cls(self) + def single_to_list(self, config: dict, keys): + num_layer = int(config['num_layer']) + for k in keys: + v = config.get(k, None) + if v is not None and not isinstance(v, Sequence): + config[k] = [v] * num_layer + return config + def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index e66da22df0..11e99edfa0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -33,6 +33,7 @@ InternVLChatModel='internvl', # deepseek-vl MultiModalityCausalLM='deepseekvl', + DeepseekV2ForCausalLM='deepseek2', # MiniCPMV MiniCPMV='minicpmv', # mini gemini diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index febb8692dd..40a48402af 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -68,3 +68,4 @@ endif () add_subdirectory(attention) add_subdirectory(gemm) +add_subdirectory(norm) diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index af9d47e0e6..32de38981a 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(attention STATIC codegen/decoding_sm80_64_f16_f16.cu codegen/decoding_sm80_64_f16_u4.cu codegen/decoding_sm80_64_f16_u8.cu + codegen/attention_sm80_192.cu + codegen/decoding_sm80_192.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index 3f557234bc..e7642584c2 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,6 +46,12 @@ void dispatchAttention(const AttentionParams& params) else if (params.size_per_head == 128) { return dispatch(std::integral_constant{}); } + + if (params.size_per_head == 192) { + using Config = AttentionConfig; + return invokeAttention(params); + } + FT_CHECK(0); } diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu new file mode 100644 index 0000000000..ceeafa7a6d --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu new file mode 100644 index 0000000000..214e6748d9 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -0,0 +1,20 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 1b04b7d4eb..67bd81e45b 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -2,8 +2,8 @@ #include "decoding.h" #include "decoding_config.h" +#include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -// #include "src/turbomind/utils/dispatch.h" #include #include @@ -113,6 +113,21 @@ void dispatchDecoding(const AttentionParams& params) return false; }; + if (params.size_per_head == 192) { + + if (is_kv_int8) { + invokeDecoding>(params); + } + else if (is_kv_int4) { + FT_CHECK_WITH_INFO(!is_kv_int4, "not implemented"); + // invokeDecoding>(params); + } + else { + invokeDecoding>(params); + } + return; + } + auto success = dispatch(); FT_CHECK(success); diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 7dcb119cfd..dfd5e07835 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -40,7 +40,7 @@ struct DecodingConfig 2) }; template -struct DecodingConfig { +struct DecodingConfig> { static constexpr int Qh = (Qh_ + 7) / 8 * 8; using Attention = Impl; using CacheIter = GetBlockIterFactory; @@ -76,4 +76,14 @@ struct DecodingConfig { using Kernel = AttentionUniversal, CacheIter, DecodingCtaMap>; }; +template +struct DecodingConfig { + static constexpr int Qh = 1; + static constexpr int HeadDim = 192; + + using Attention = Impl; + using CacheIter = GetBlockIterFactory; + using Kernel = AttentionUniversal, Attention>, CacheIter, DecodingCtaMap>; +}; + } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 6e8f37f4d4..07c7dcb12b 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,26 +63,28 @@ struct Impl>, SmemLayoutV2>>; - using SmemLayoutK = std::conditional_t>, SmemLayoutV2>>; - using SmemLayoutV = std::conditional_t>, SmemLayoutV2>>; using SmemLayoutKVp = void; + static constexpr bool kUseSmemQ = false; + static constexpr bool kUseSmemP = false; + + static_assert(!kUseSmemQ, "current smemQ impl yields inconsistent outputs"); + union SharedStorage { __align__(16) T KV[Stages * (SmemLayoutK::kSize + SmemLayoutV::kSize) / 2]; __align__(16) T Q[SmemLayoutQ::kSize]; }; - static constexpr bool kUseSmemQ = false; - static constexpr bool kUseSmemP = false; - using ThreadMapQ = RakedThreadMap; using ThreadMapKV = RakedThreadMap; @@ -109,22 +111,24 @@ struct Impl sQ{smem_Q}; + SmemAccessor sQ{smem_Q}; - // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout - PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { + // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { - const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; - const int di = lane_id / 16 * 8 + k * 16; - ldsm_x4((Array&)frag_Q[k][m], 
cast_smem_ptr_to_uint(&sQ(qi, di))); + for (int m = 0; m < K_M; ++m) { + PRAGMA_UNROLL + for (int k = 0; k < K_K; ++k) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } } } - if constexpr (kUseSmemQ) { + if constexpr (0) { __syncthreads(); // Rearrange Q in smem so that swizzling is not needed for later LDSMs @@ -142,20 +146,25 @@ struct Impl smem_K; + T* smem_Q; FragQ frag_Q; FragK frag_K; __device__ StateQK(SharedStorage& storage, FragQ frag_Q_): smem_K{storage.KV} { - static_assert(!kUseSmemQ, "not implemented"); - PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { + if constexpr (!kUseSmemQ) { PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { - frag_Q[k][m] = frag_Q_[k][m]; + for (int k = 0; k < K_K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + frag_Q[k][m] = frag_Q_[k][m]; + } } } + else { + smem_Q = storage.Q; + } } __device__ void Load(int k, int pipe_iter) @@ -166,6 +175,16 @@ struct Impl sQ{smem_Q}; + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } + } PRAGMA_UNROLL for (int n = 0; n < K_N; n += 2) { // Load (s16,d16) tiles const int s = n * 8 + offset_s; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 3b90bcdf57..f865f1bc3a 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,7 @@ struct Impl) { - return std::conditional_t>, SmemLayoutV2>>{}; } diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index a886185a44..444b67e2c8 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,16 @@ #pragma once -#include "src/turbomind/kernels/attention/impl.h" +#include +#include +#include + #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/thread_map.h" -#include -#include + +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/quantization.h" namespace turbomind::attention { @@ -51,7 +55,7 @@ struct Impl), K_K); }; struct LinearD { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 20bb00fde8..f2e2faef91 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -277,11 +277,14 @@ void invokeProcessKV_v2(char** blocks, }; auto dispatch = [&](auto tkv) { - if (head_dim == 128) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } - else if (head_dim == 64) { - return invoke(tkv, std::integral_constant{}); + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); } FT_CHECK(0); }; @@ -545,6 +548,9 @@ void invokeFlattenKV_v2(T* k, else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); + } FT_CHECK(0); }; diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index bf0fc1d32a..4435400b70 100644 --- 
a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -52,7 +52,7 @@ struct Mainloop, Impl_> { template __device__ void operator()(Args&&... args) { - Run(Sm80_CpAsync{}, ((Args &&) args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args &&) args)...); } template @@ -81,8 +81,9 @@ struct Mainloop, Impl_> { } } - template + template __device__ void Run(Sm80_CpAsync, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -199,9 +200,10 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#if 0 + // #if 1 template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -234,7 +236,7 @@ struct Mainloop, Impl_> { Wait(); state_QK.Load(0, 0); - constexpr auto _ = [](int){}; + constexpr auto _ = [](int) {}; auto loop = [&](auto is_residue, auto is_mask) { const int offset_K = tile_iter * CTA_S; @@ -292,14 +294,15 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#elif 1 + // #elif 1 // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // - more register consumption // - more interleaved HMMA and FMA // - slight performance gain - template + template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter_, FragO& frag_O, @@ -407,7 +410,7 @@ struct Mainloop, Impl_> { __pipeline_commit(); __pipeline_wait_prior(0); } -#endif + // #endif __device__ void Wait() { diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 12f6aff38b..c654f40d05 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -66,12 +66,14 @@ void invokeReduce(T* out, float exp_scale, \ cudaStream_t stream); -INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(64, half); +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 +INSTANTIATE_invokeReduce(64, nv_bfloat16); INSTANTIATE_invokeReduce(128, nv_bfloat16); -INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 88a3ab3af8..b4c9064cfe 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -128,9 +128,12 @@ struct Reduce { __syncthreads(); - constexpr int kVecSize = HeadDim / WARP_SIZE; + // HeadDim / WARP_SIZE + // 128 -> 4 + // 64, 192 -> 2 + constexpr int kVecSize = HeadDim % 128 == 0 ? 
4 : 2; - using Map = RakedThreadMap; + using Map = RakedThreadMap; static_assert(Map::kIterS == CTA_H); diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8e09da22cd..db836ed184 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -131,6 +131,7 @@ struct FastRoPE { template __device__ void apply(Array& x, float timestep) { +#if 0 PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { float c, s; @@ -144,6 +145,22 @@ struct FastRoPE { x[i + 1] = (T)tmp1; } } +#else + // Most models apply rotary embedding in half precision + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; + T tmp0 = (T)c * x[i] - (T)s * x[i + 1]; + T tmp1 = (T)c * x[i + 1] + (T)s * x[i]; + if (is_valid_) { + x[i] = tmp0; + x[i + 1] = tmp1; + } + } +#endif } }; diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index c6d7b40637..804d4815dc 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -218,14 +218,14 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, #define KV_INT4 0 -#define DECODING 1 +#define DECODING 0 template int test_attention() { AttentionParams params{}; - constexpr size_t kHeadDim = 128; + constexpr size_t kHeadDim = 192; #if DECODING // constexpr size_t kHeadNum = 32; @@ -239,11 +239,11 @@ int test_attention() // constexpr size_t kSequenceLen = 511; // constexpr size_t kSequenceLen = 2047; // constexpr size_t kSequenceLen = 4095; - // constexpr size_t kSequenceLen = 8191; + constexpr size_t kSequenceLen = 8191; // constexpr size_t kSequenceLen = 32767; // constexpr size_t kSequenceLen = 65535; // constexpr size_t kSequenceLen = 131071; - constexpr size_t kSequenceLen = 200000; + // constexpr size_t kSequenceLen = 200000; // constexpr size_t kSequenceLen = 262143; // constexpr size_t kSequenceLen = (1 << 20) - 1; // 1M // constexpr size_t kSequenceLen = (1 << 22) - 1; // 4M @@ -451,6 +451,10 @@ int test_attention() params.qk = qk_buf.data().get(); params.pr = pr_buf.data().get(); + params.attention_scaling = 1.f; + params.llama3_inv_scaling_factor = 0; + params.yarn_ramp_inv_factor_div_2 = 0; + Reference reference(kDump ? 
Reference::kUNFUSED : Reference::kFLASH_ATTENTION, {}); // Reference reference(Reference::kUNFUSED, {}); reference.Reshape(kInputLen, kContextLen, kHeadNum, kHeadDim, KvHeadNum, kBatchSize); diff --git a/src/turbomind/kernels/core/array_ops.h b/src/turbomind/kernels/core/array_ops.h index 6b639abc83..ec6e7fb4ed 100644 --- a/src/turbomind/kernels/core/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -172,7 +172,7 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } template -inline __device__ void Store(T* __restrict__ dst, const Array& src) +inline __device__ void Store(T* dst, const Array& src) { if constexpr (sizeof(Array) == sizeof(uint4)) { *(uint4*)dst = (const uint4&)src; diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index a708a34985..054269c27f 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -5,6 +5,7 @@ #include "src/turbomind/kernels/core/common.h" #include #include +#include namespace turbomind { @@ -41,6 +42,13 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +template +TM_HOST_DEVICE constexpr T lowbit(T x) +{ + const std::make_signed_t s = x; + return static_cast(s & -s); +} + // https://arxiv.org/abs/1902.01961 template struct FastDivMod { diff --git a/src/turbomind/kernels/core/thread_map.h b/src/turbomind/kernels/core/thread_map.h index 66b691832f..1271aefcc0 100644 --- a/src/turbomind/kernels/core/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" #include @@ -51,7 +52,7 @@ struct ThreadMapQ { } }; -template +template struct RakedThreadMap { static constexpr int kDimC = DimC; static constexpr int kDimS = DimS; diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt index d41c391e9d..81c9750584 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt @@ -8,9 +8,11 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu flash_fwd_hdim128_bf16_sm80.cu - # flash_fwd_hdim256_fp16_sm80.cu + flash_fwd_hdim256_bf16_sm80.cu + flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) + set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h index e108a55f28..2456496367 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h @@ -147,7 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream) }); } -#if 0 +#if 1 template void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream) { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index fd19a0ea61..b1df29cb7b 100644 --- 
a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -58,6 +58,18 @@ return __VA_ARGS__(); \ } \ }() +#elif 1 +#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } \ + else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() #else #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h index 4fec5b732f..bd03917b89 100644 --- a/src/turbomind/kernels/gemm/context.h +++ b/src/turbomind/kernels/gemm/context.h @@ -113,12 +113,7 @@ class DynamicGemmContext: public StaticGemmContext { class MoeGemmContext: public Context { public: - MoeGemmContext(int experts, - int experts_per_token, - // int output_dims, - // int input_dims, - const cudaDeviceProp& prop, - cudaStream_t stream); + MoeGemmContext(int experts, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream); ~MoeGemmContext() override; @@ -156,9 +151,11 @@ class MoeGemmContext: public Context { Tape Schedule(const LaunchSpec&) override; - void set_offsets(const int* offsets) + void update(int expert_num, int experts_per_token, const int* offsets) { - offsets_ = offsets; + expert_num_ = expert_num; + experts_per_token_ = experts_per_token; + offsets_ = offsets; } protected: diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index ed8b2ee2ff..e58bfc9b95 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -279,17 +279,44 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for return {}; } -void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +namespace { + +template +struct Param { + StridedPtr data[N]; + StridedPtr* ptr; + int n; +}; + +template +__global__ void fill_strided_ptrs(Param param) { - std::vector tmp; - for (const auto& [p, s] : ptrs) { - tmp.push_back({p, s}); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < param.n) { + param.ptr[idx] = param.data[idx]; } +} + +} // namespace + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + constexpr int N = 64; + Param param{}; + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); - cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); - // Sync before tmp can be destructed - cudaStreamSynchronize(stream); + param.ptr = ptr; + for (int i = 0; i < (int)ptrs.size(); i += N) { + const int n = std::min(ptrs.size() - i, N); + for (int j = 0; j < n; ++j) { + auto& [p, s] = ptrs[i + j]; + param.data[j] = StridedPtr{p, s}; + } + param.n = n; + fill_strided_ptrs<<<1, N, 0, stream>>>(param); + param.ptr += N; + } return ptr; } diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 5912c60a8a..a9e4f7da51 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -264,7 +264,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int token_num_padded, int expert_num, int top_k, - bool norm_topk) + bool norm_topk, + float routed_scale) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num 
/ items_per_thread; // 8 @@ -286,8 +287,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; - const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + // const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + // const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; float data[items_per_thread]; int idxs[items_per_thread]; @@ -413,7 +414,13 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] #endif - constexpr float kLog2e = 1.4426950408889634074; + // constexpr float kLog2e = 1.4426950408889634074; + // if (k == 0) { + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // data[i] *= kLog2e; + // } + // } unsigned mask = (unsigned)-1; float max_logit; @@ -437,13 +444,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - if (k == 0) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] *= kLog2e; - } - } - int g_max_ei = ei; float g_max_val = max_val; if constexpr (threads_per_token > 1) { @@ -486,7 +486,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < items_per_thread; ++i) { if (!norm_topk || used[i]) { - data[i] = exp2f(data[i] - max_logit); + data[i] = expf(data[i] - max_logit); sum_prob += data[i]; } } @@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { - int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - smem.shared_accum[t][e] = 0; + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + if (t < max_tiles) { + smem.shared_accum[t][e] = 0; + } } __syncthreads(); @@ -536,10 +538,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] if (ti2 < token_num && idx < top_k) { masks[expert_id * token_num_padded + ti2] = idx; - scales[idx * token_num + ti2] = scale; + scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); - - // printf("%d %d %f\n", idx, expert_id, scale); } } @@ -569,6 +569,7 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n int experts, // E int experts_per_token, bool norm_topk, + float routed_scale, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -581,14 +582,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread, auto vec_size) { constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; constexpr int threads = 256; const int blocks = ceil_div(tokens, threads / thrs_per_tok); cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - MoeGateKernel_v8 + MoeGateKernel_v8 <<>>( // scales, (int8_t*)masks, @@ -600,28 +601,49 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tokens_padded, experts, experts_per_token, - norm_topk); + norm_topk, + routed_scale); }; auto fail = [&] { - std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << experts + << ", top_k=" << 
experts_per_token << "\n"; std::abort(); }; if (experts <= 8) { if (experts_per_token <= 2) { - invoke(_Int<8>, _Int<2>, _Int<8>); + // MoeGateKernel_V2<2, 128><<>>(scales, + // (int8_t*)masks, + // accum, + // logits, + // log_tile, + // tiles, + // tokens, + // tokens_padded, + // experts); + + // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n"; + invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { - invoke(_Int<8>, _Int<8>, _Int<8>); + invoke(_Int<8>, _Int<8>, _Int<8>, _Int<4>); } } else if (experts <= 64) { if (experts_per_token <= 4) { - invoke(_Int<64>, _Int<4>, _Int<16>); + invoke(_Int<64>, _Int<4>, _Int<16>, _Int<4>); } else if (experts_per_token <= 8) { - invoke(_Int<64>, _Int<8>, _Int<16>); + invoke(_Int<64>, _Int<8>, _Int<16>, _Int<4>); + } + else { + fail(); + } + } + else if (experts <= 160) { + if (experts_per_token <= 8) { + invoke(_Int<160>, _Int<8>, _Int<10>, _Int<2>); } else { fail(); @@ -687,7 +709,8 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] const int* en2f, // [ e, n] :: (e,n) -> e*n const float* dst_scales, // [n] int dims, - int tokens) + int tokens, + float dst_scale) { using Vec = Array; @@ -695,7 +718,6 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; - float dst_scale = 0; if (dst_scales) { dst_scale = dst_scales[ti]; dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); @@ -711,8 +733,9 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] } for (int i = threadIdx.x; i < dims; i += block_dim) { +#if 1 Array accum{}; - if (dst_scales) { + if (dst_scale) { Vec v; Ldg(v, dst_ptr[i].data()); using namespace ops; @@ -727,6 +750,24 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] accum = accum + x; } Store(dst_ptr[i].data(), cast(accum)); +#else + Array accum{}; + if (dst_scale) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = v * (T)dst_scale; + } + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = v * (T)scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), accum); +#endif } } @@ -739,6 +780,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st) { // std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -754,7 +796,8 @@ void invokeMoeReduce(T* dst, en2f, dst_scales, dims / vec_size, - tokens); + tokens, + dst_scale); }; switch (experts_per_token) { @@ -774,10 +817,11 @@ void invokeMoeReduce(T* dst, } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 template void -invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); +invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce( + nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) @@ -833,4 +877,89 @@ std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, return ret; } +template +__global__ void MoeMaskTopKGroups(float* logits, int token_num, int expert_num, int top_k) +{ + constexpr int threads_per_token = max_expert_num / items_per_thread; + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + 
static_assert(items_per_thread % access_size == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + float max_val = -std::numeric_limits::infinity(); + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + max_val = fmaxf(max_val, data[i + c]); + } + } + } + } + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + bool alive = false; + + for (int k = 0; k < top_k; ++k) { + int g_max_ei = ei; + float g_max_val = max_val; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + if (ei == g_max_ei) { + alive = true; + max_val = -std::numeric_limits::infinity(); + } + } + + if (!alive && ti < token_num) { + Array vec; + fill(vec, -std::numeric_limits::infinity()); + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Store(&logits[ti * expert_num + e], vec); + } + } + } +} + +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st) +{ + auto invoke = [&](auto max_expert_num, auto items_per_thread, auto vec_size) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(token_num, threads / thrs_per_tok); + MoeMaskTopKGroups + <<>>(logits, token_num, expert_num, top_k); + }; + if (expert_num == 160 && group_size == 20) { + return invoke(_Int<160>, _Int<20>, _Int<4>); + } + + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << expert_num + << ", group_size=" << group_size << "\n"; + std::abort(); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 0e4c36af09..d53de1354e 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -22,6 +22,7 @@ void invokeMoeGate_V2(int* f2n, int experts, int exp_per_tok, bool norm_topk, + float routed_scale, cudaStream_t st); template @@ -54,8 +55,11 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st); +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); + // Sample `e` from `E` experts uniformly for every token std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 47e3bfdb16..4b2ea6a83a 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -45,72 +45,6 @@ void diff_vecs(const T* data, const T* refs, int m, int k, 
std::string msg) } } -#if 0 -void func() -{ - using thrust::universal_vector; - - // clang-format off - std::vector h_logits{ - 8, 5, 1, 4, 3, 6, 2, 7, - 50, 60, 90, 20, 70, 71, 72, 73, - 0, 1, 0, 0, 0, 1, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 2}; - // clang-format on - - h_logits.resize(8); - - // auto tmp = h_logits; - // for (int i = 0; i < 127; ++i) { - // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); - // } - - universal_vector logits(h_logits.begin(), h_logits.end()); - - const int E = 8; - const int n = h_logits.size() / E; - const int e = 2; - - const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - universal_vector f2n(e * n); - universal_vector en2f(e * n); - universal_vector offsets(E + 1); - universal_vector accum(E * kMoeGateMaxTiles); - universal_vector scales(n * e); - universal_vector masks(E * n_padded); - - for (int i = 0; i < 10; ++i) { - gemm::CacheFlushing::flush(0); - cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); - invokeMoeGate_V2(f2n.data().get(), - en2f.data().get(), - offsets.data().get(), - scales.data().get(), - masks.data().get(), - accum.data().get(), - logits.data().get(), - n, - n_padded, - E, - e, - 0); - } - - auto err = cudaDeviceSynchronize(); - if (err) { - std::cerr << cudaGetErrorString(err) << "\n"; - } - - print_vecs(scales.data().get(), e, n, "scales", 12); - print_vecs(masks.data().get(), E, n_padded, "tmp"); - print_vecs(accum.data().get(), E, 1, "accum"); - print_vecs(offsets.data().get(), 1, E + 1, "offsets"); - print_vecs(f2n.data().get(), n * e, 1, "f2n"); - print_vecs(en2f.data().get(), e, n, "en2f"); -} -#endif - RNG& gRNG() { static RNG inst{}; @@ -271,6 +205,8 @@ bool test_moe_gate(int tokens, // cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + // invokeMaskMoeTopKGroups(logits.data().get(), tokens, expert_num, expert_num / 8, 3, nullptr); + for (int i = 0; i < 1; ++i) { gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); @@ -286,8 +222,9 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, - true, - 0); + false, + 1.f, + nullptr); } // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); @@ -334,6 +271,8 @@ bool test_moe_gate(int tokens, // success = false; } + // print_vecs(logits.data().get(), tokens, expert_num, "logits", 12); + if (!success && 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -353,6 +292,15 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + for (int i = 0; i < tokens; ++i) { + float sum = 0; + for (int j = 0; j < experts_per_token; ++j) { + sum += scales[j * tokens + i]; + } + std::cout << sum << " "; + } + std::cout << "\n"; + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); @@ -393,7 +341,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(65536, 8, 2, tape, tiling); + test_moe_gate(16, 160, 6, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 7a089fbdf2..4747644f9a 100644 --- 
a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -357,7 +357,7 @@ class Testbed { } } - ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + ((MoeGemmContext*)ctx_.get())->update(experts_, exp_per_tok_, moe_m_offsets_.data().get()); CHECK(batch_dim == 0); CHECK(a_desc_.order == kRowMajor); @@ -518,6 +518,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); invokeMoeReduce(c_ref_.data().get(), @@ -528,6 +529,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); cudaDeviceSynchronize(); diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu index 92f468d82b..39e6a2e1aa 100644 --- a/src/turbomind/kernels/gemm/unpack.cu +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -71,14 +71,44 @@ void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaS permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); } +__global__ void transpose_u4_kernel(uint4_t* dst, const uint4_t* src, int s, int c) +{ + const int idx_c = 8 * (threadIdx.x + blockIdx.x * blockDim.x); + const int idx_s = 8 * (threadIdx.y + blockIdx.y * blockDim.y); + if (idx_c >= c || idx_s >= s) { + return; + } + uint32_t ivec[8]; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ivec[i] = ((const uint32_t*)src)[((idx_s + i) * c + idx_c) / 8]; + } + uint32_t ovec[8]{}; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + ovec[i] |= (((ivec[j] >> (i * 4)) & 0xfu) << (j * 4)); + } + } + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ((uint32_t*)dst)[((idx_c + i) * s + idx_s) / 8] = ovec[i]; + } +} + void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) { if (s % 8 || c % 8) { std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; return; } - Array shape{s, c}; - permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + // Array shape{s, c}; + // permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + + const dim3 block(16, 16); + const dim3 grid((c + 15) / 16, (s + 15) / 16); + transpose_u4_kernel<<>>(dst, src, s, c); } // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store diff --git a/src/turbomind/kernels/norm/CMakeLists.txt b/src/turbomind/kernels/norm/CMakeLists.txt new file mode 100644 index 0000000000..bc1569c405 --- /dev/null +++ b/src/turbomind/kernels/norm/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_library(rms_norm rms_norm.cu) +set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu new file mode 100644 index 0000000000..22fd69f52a --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -0,0 +1,235 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
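The new rms_norm.cu that begins here normalizes each row as y = x * rsqrt(mean(x^2) + eps) * w, accumulating the sum of squares in float across the block (cub::BlockReduce) and using vectorized loads/stores. A minimal host-side reference of the same arithmetic is sketched below for validation purposes, assuming contiguous float rows; it mirrors neither the vectorization nor the half/bf16 instantiations:

// rms_norm_reference.cpp -- host reference for the RMSNorm math (illustrative only).
#include <cmath>
#include <cstdio>
#include <vector>

void rms_norm_ref(float* dst, const float* src, const float* weight,
                  int dims, int num, float eps)
{
    for (int t = 0; t < num; ++t) {
        const float* x = src + t * dims;
        float*       y = dst + t * dims;

        // sum of squares over the row, as the kernel's block-wide reduction does
        float sum_sq = 0.f;
        for (int d = 0; d < dims; ++d) {
            sum_sq += x[d] * x[d];
        }

        // rsqrt(mean(x^2) + eps), then scale by the per-channel weight
        const float inv_rms = 1.f / std::sqrt(sum_sq / dims + eps);
        for (int d = 0; d < dims; ++d) {
            y[d] = x[d] * inv_rms * weight[d];
        }
    }
}

int main()
{
    std::vector<float> x{1.f, 2.f, 3.f, 4.f}, w(4, 1.f), y(4);
    rms_norm_ref(y.data(), x.data(), w.data(), /*dims=*/4, /*num=*/1, 1e-6f);
    for (float v : y) {
        std::printf("%f ", v);
    }
    std::printf("\n");
    return 0;
}

The fused BiasResidualRMSNorm variant further down follows the same recipe after first updating the residual in place (r' = r + h + optional bias) and writing the normalized result to hidden_states.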
+ +#include "cub/block/block_reduce.cuh" + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" + +namespace turbomind { + +template +__global__ void RMSNormKernel(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + src += src_ld * ti; + + Array accum{}; + Array vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + using namespace ops; + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + dst += dst_ld * ti; + + Array sv; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Ldg(sv, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + vec[c] = (T)((float)vec[c] * sum) * sv[c]; + // vec[c] = (T)((float)vec[c] * sum * (float)sv[c]); + } + Store(&dst[i], vec); + } +} + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; + const int blocks = num; + + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeRMSNorm(half* dst, + int dst_ld, + const half* src, + int src_ld, + const half* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#if ENABLE_BF16 +template void invokeRMSNorm(nv_bfloat16* dst, + int dst_ld, + const nv_bfloat16* src, + int src_ld, + const nv_bfloat16* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +// r' <- r + (h + b) +// h' <- norm(r') * w +template +__global__ void BiasResidualRMSNormKernel(T* __restrict__ residual, + T* __restrict__ hidden_states, + const T* __restrict__ weights, + const T* __restrict__ bias, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + residual += dims * ti; + hidden_states += dims * ti; + + Array accum{}; + + Array r_vec; + Array h_vec; + Array b_vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Load(h_vec, &hidden_states[i]); + + using namespace ops; + r_vec = r_vec + h_vec; + + if (bias) { + Ldg(b_vec, &bias[i]); + r_vec = r_vec + b_vec; + } + + Store(&residual[i], r_vec); + + Array tmp = cast(r_vec); + + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + Array w_vec; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Ldg(w_vec, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < 
vec_size; ++c) { + r_vec[c] = (T)((float)r_vec[c] * sum) * w_vec[c]; + } + Store(&hidden_states[i], r_vec); + } +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + constexpr int threads = 512; + const int blocks = num; + + BiasResidualRMSNormKernel<<>>(residual, // + hidden_states, + weights, + bias, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeBiasResidualRMSNorm(half* residual, + half* hidden_states, + const half* weights, + const half* bias, + int dims, + int num, + float eps, + cudaStream_t st); + +#if ENABLE_BF16 +template void invokeBiasResidualRMSNorm(nv_bfloat16* residual, + nv_bfloat16* hidden_states, + const nv_bfloat16* weights, + const nv_bfloat16* bias, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +} // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h new file mode 100644 index 0000000000..83fa0f8263 --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -0,0 +1,21 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind { + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); + +template +void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 285fcea31f..3c714bd234 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -20,11 +20,13 @@ add_library(Llama STATIC unified_attention_layer.cc llama_kernels.cu llama_decoder_kernels.cu - llama_utils.cu) + llama_utils.cu + mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart gemm2 + rms_norm cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4138174e5d..ea321d06a0 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -20,6 +20,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/nccl_utils.h" #include #include #include @@ -1041,6 +1042,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template @@ -1990,7 +1994,7 @@ void LlamaBatch::tune() nullptr, nullptr); // implicit barrier for TP - check_cuda_error(cudaStreamSynchronize(stream_)); + ftNcclStreamSynchronize(model_->tensor_para_, {}, stream_); } auto tock = std::chrono::steady_clock::now(); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 
f6f9ab0efa..0a2a3be175 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -52,28 +52,21 @@ static bool is_fuse_silu_act() } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - head_num_(head_num), - kv_head_num_(kv_head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - inter_size_(inter_size), - weight_type_(weight_type), - attn_bias_(attn_bias), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + inter_size_(model.inter_size.at(layer_id)), + weight_type_(model.weight_type), + attn_bias_(model.attn_bias), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { @@ -88,7 +81,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, auto& weight = *weights[i]; int rank = lora_param.r; float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_idx) + "." + name; + std::string full_name = "layers." + std::to_string(layer_id) + "." + name; for (const auto& [re, pr] : lora_param.rank_pattern) { if (std::regex_search(full_name, pr.first)) { @@ -114,36 +107,44 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; - self_attn_weights.qkv.type = weight_type; - self_attn_weights.qkv.group_size = group_size; - - self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type; - self_attn_weights.output.group_size = group_size; + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + tensor_para_size_, + weight_type_, + model.group_size}; ffn_weights = LlamaFfnWeight{ hidden_units_, inter_size_, tensor_para_size_, weight_type_, - group_size, + model.group_size, weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), }; - moe_weights = MoeFfnWeight{hidden_units_, - moe_param.inter_size, - moe_param.expert_num, - moe_param.method, - moe_param.shared_gate, - tensor_para_size_, - weight_type, - group_size, - is_fuse_silu_act()}; - - mallocWeights(); + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; +} + +template +void LlamaDecoderLayerWeight::malloc(cudaStream_t st) +{ + deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); + deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); + + self_attn_weights.malloc(st); + + if (inter_size_) { + ffn_weights.malloc(st); + } + + if (!moe_weights.experts.empty()) { + 
moe_weights.malloc(st); + } } template @@ -168,52 +169,6 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept return size * sizeof(uint16_t); } -template -void freeWeights(LlamaDenseWeight& weights) -{ - cudaFree(weights.kernel); - cudaFree(weights.bias); - cudaFree(weights.scales); - cudaFree(weights.zeros); - - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales = nullptr; - weights.zeros = nullptr; - - { - cudaFree(weights.lora.a); - cudaFree(weights.lora.b); - weights.lora.a = nullptr; - weights.lora.b = nullptr; - } -} - -template -void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) -{ - if (bias) { - deviceMalloc((T**)&weights.bias, weights.output_dims); - } - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(weights.input_dims % factor == 0); - deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); - deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); - } - - if (weights.lora.r > 0) { - deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); - deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); - } -} - template std::string concat(FirstArg&& first, Args&&... args) { @@ -342,64 +297,24 @@ void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType mode } template -void LlamaDecoderLayerWeight::mallocWeights() +void LlamaDecoderLayerWeight::free(cudaStream_t st) { - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_); + deviceFree(self_attn_norm_weights, st); + deviceFree(ffn_norm_weights, st); - mallocWeights(self_attn_weights.qkv, attn_bias_); - mallocWeights(self_attn_weights.output, attn_bias_); + self_attn_weights.free(st); if (inter_size_) { - mallocWeights(ffn_weights.gating, false); - mallocWeights(ffn_weights.intermediate, false); - mallocWeights(ffn_weights.output, false); + ffn_weights.free(st); } if (!moe_weights.experts.empty()) { - mallocWeights(moe_weights.gate, false); - for (auto& e : moe_weights.experts) { - mallocWeights(e.gating, false); - mallocWeights(e.intermediate, false); - mallocWeights(e.output, false); - } - if (moe_weights.shared_gate.output_dims) { - mallocWeights(moe_weights.shared_gate, false); - } + moe_weights.free(st); } } template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() -{ - cudaFree((void*)self_attn_norm_weights); - cudaFree((void*)ffn_norm_weights); - self_attn_norm_weights = nullptr; - ffn_norm_weights = nullptr; - - freeWeights(self_attn_weights.qkv); - freeWeights(self_attn_weights.output); - - if (inter_size_) { - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); - } - - if (!moe_weights.experts.empty()) { - freeWeights(moe_weights.gate); - for (auto& e : moe_weights.experts) { - freeWeights(e.fused_gating_intermediate); - freeWeights(e.gating); - freeWeights(e.intermediate); - freeWeights(e.output); - } - if (moe_weights.shared_gate.kernel) { - 
freeWeights(moe_weights.shared_gate); - } - } -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; template void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) @@ -432,6 +347,24 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType } } +template +void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) +{ + if (w.q_proj.output_dims) { + getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); + } + else { + getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); + getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); + m.insert(concat(p, "attention.q_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); + } + getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); + getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); + m.insert(concat(p, "attention.kv_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); +} + template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { @@ -445,7 +378,12 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + if (self_attn_weights.qkv.output_dims) { + getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + } + else { + getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_); + } getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); if (inter_size_) { @@ -478,7 +416,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) } // template -static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void convert_u4( + LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { FT_CHECK(weight.type == WeightType::kINT4); @@ -488,11 +427,11 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); - cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); + cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); } - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); sync_check_cuda_error(); MatrixLayout w_desc{ @@ -507,25 +446,22 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* k_desc.type = gemm::DataType::U4; k_desc.pack = pack_b; - cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2); + cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st); - FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0); + 
FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0); sync_check_cuda_error(); const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count); + fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st); // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); sync_check_cuda_error(); - cudaDeviceSynchronize(); - - cudaFree(weight.scales); - cudaFree(weight.zeros); - weight.scales = weight.zeros = nullptr; + deviceFree(weight.scales, st); + deviceFree(weight.zeros, st); - deviceMalloc((half**)&weight.scales_zeros, scale_count * 2); + deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st); MatrixLayout s_desc{ gemm::DataType::U32, @@ -538,7 +474,7 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* MatrixLayout q_desc = s_desc; q_desc.pack = pack_v; - FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0); + FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0); sync_check_cuda_error(); weight.k_desc = k_desc; @@ -548,7 +484,8 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* } template -static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { using namespace gemm; @@ -563,12 +500,13 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor const int output_dim = weight.output_dims; if (order_b == kColMajor) { - invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr); + invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } MatrixLayout src{ @@ -583,35 +521,42 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor dst.pack = pack_b; if (pack_b) { - FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0); + FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } weight.k_desc = dst; } template -static void convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { if (weight.type == WeightType::kINT4) { if constexpr (std::is_same_v) { - convert_u4(weight, is_fused_moe, workspace, size, use_simt); + convert_u4(weight, is_fused_moe, workspace, size, use_simt, st); } else { FT_CHECK(0); } } else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt); + convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); } } template -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, 
void* workspace, size_t size) +void interleave(LlamaDenseWeight& c, + LlamaDenseWeight& a, + LlamaDenseWeight& b, + void* workspace, + size_t size, + cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -628,18 +573,18 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight const auto sentinel = tmp_c + c.output_dims * c.input_dims; FT_CHECK(sentinel <= (uint8_t*)workspace + size); - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); } else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); } // Check at function level @@ -647,7 +592,7 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight } template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -656,9 +601,11 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& FT_CHECK(c.group_size == a.group_size); FT_CHECK(c.group_size == b.group_size); - auto _chunks = [](auto c, auto a, auto b, int height, int width) { - check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); - check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); }; if (c.type == WeightType::kINT4) { @@ -675,37 +622,37 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& } template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) { const bool is_16xx = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, false, workspace, size, is_16xx); - convert(self_attn_weights.output, false, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); + 
convert(self_attn_weights.output, false, workspace, size, is_16xx, st); auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { if (fused_up_and_gate_) { auto& fused_up_and_gate = ffn.fused_gating_intermediate; - mallocWeights(fused_up_and_gate, false); + fused_up_and_gate.malloc(st); if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - freeWeights(ffn.gating); - freeWeights(ffn.intermediate); + ffn.gating.free(st); + ffn.intermediate.free(st); } else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); } - convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); }; if (inter_size_) { @@ -722,7 +669,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud for (auto& e : moe_weights.experts) { - process_ffn(e, moe_weights.method); + process_ffn(e, moe_weights.method == MoeParam::kFused); const auto& fused = e.fused_gating_intermediate; const auto& output = e.output; @@ -743,12 +690,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud auto& output = moe_weights.block.output; // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); } fused.k_desc.ld = output.k_desc.ld = 0; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index f68a103dd5..9b204ed0dc 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,19 +30,14 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); + ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; @@ -51,17 +46,21 @@ struct 
LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); size_t workspace_size() const noexcept; - void mallocWeights(LlamaDenseWeight& weights, bool bias); + void malloc(cudaStream_t st); + + void free(cudaStream_t st); + + T* self_attn_norm_weights{}; + T* ffn_norm_weights{}; - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; LlamaAttentionWeight self_attn_weights{}; - LlamaFfnWeight ffn_weights{}; - MoeFfnWeight moe_weights{}; + + LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; @@ -76,8 +75,6 @@ struct LlamaDecoderLayerWeight { size_t tensor_para_rank_; bool is_maintain_buffer_ = false; bool fused_up_and_gate_; - - void mallocWeights(); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 169fb53bcf..944781bf5d 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -20,64 +20,14 @@ #pragma once #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -enum class LoraPolicy : int -{ - kNull, - kPlora, -}; - inline LoraPolicy getLoraPolicy(const std::string& policy) { if (policy == "plora") { @@ -96,20 +46,31 @@ struct LoraWeight { template struct LlamaDenseWeight { - size_t input_dims; - size_t output_dims; - void* kernel; + size_t input_dims = 0; + size_t output_dims = 0; + WeightType type; // uninitialized + void* kernel = nullptr; + T* bias = nullptr; + T* scales = nullptr; + T* zeros = nullptr; + T* scales_zeros = nullptr; + int group_size = 1; + LoraWeight lora; - WeightType type; - T* bias; - T* scales; - T* zeros; - T* scales_zeros; - int group_size; gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + + LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} + { + this->input_dims = input_dim; + this->output_dims = output_dim; + this->type = type; + this->group_size = group_size; + } + size_t kernel_size() const noexcept { return getBitSize(type) * input_dims * output_dims / 8; @@ -129,12 +90,121 @@ struct LlamaDenseWeight { { return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; } + + void malloc(cudaStream_t st, bool with_bias = false) + { + if (with_bias) { + deviceMalloc((T**)&bias, output_dims, 
st); + } + const size_t bit_size = getBitSize(type); + if (bit_size >= 16) { // fp16, fp32 + deviceMalloc((T**)&kernel, input_dims * output_dims, st); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + FT_CHECK(input_dims % factor == 0); + deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); + deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); + deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); + } + + if (lora.r > 0) { + deviceMalloc((T**)&lora.a, input_dims * lora.r, st); + deviceMalloc((T**)&lora.b, lora.r * output_dims, st); + } + } + + void free(cudaStream_t st) + { + deviceFree(kernel, st); + deviceFree(bias, st); + deviceFree(scales, st); + deviceFree(zeros, st); + deviceFree(lora.a, st); + deviceFree(lora.b, st); + } }; template struct LlamaAttentionWeight { + + LlamaAttentionWeight() = default; + + LlamaAttentionWeight(size_t hidden_dim, + size_t head_dim, + size_t head_num, + size_t kv_head_num, + MLAParam mla, + bool bias, + size_t tp, + WeightType weight_type, + int group_size) + { + this->bias = bias; + if (mla.kv_lora_rank == 0) { + qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; + q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; + } + else { + q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; + } + kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; + kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; + } + output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; + } + + void malloc(cudaStream_t st) + { + if (qkv.output_dims) { + qkv.malloc(st, bias); + } + else { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); + } + output.malloc(st, bias); + } + + void free(cudaStream_t st) + { + qkv.free(st); + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); + output.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); + } + LlamaDenseWeight qkv; LlamaDenseWeight output; + bool bias{}; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + T* q_a_layernorm{}; + T* kv_a_layernorm{}; }; template @@ -172,6 +242,21 @@ struct LlamaFfnWeight { output.group_size = group_size; } + void malloc(cudaStream_t st) + { + gating.malloc(st); + intermediate.malloc(st); + output.malloc(st); + } + + void free(cudaStream_t st) + { + gating.free(st); + intermediate.free(st); + output.free(st); + fused_gating_intermediate.free(st); + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; @@ -186,23 +271,27 @@ struct MoeFfnWeight { MoeFfnWeight() = default; - MoeFfnWeight(size_t hidden_dim, - int inter_size, - int expert_num, - int method, - bool has_shared_gate, - size_t tp, - WeightType weight_type, - int group_size, - bool fuse_silu_act) + MoeFfnWeight(int layer_id, + const MoeParam& param, + size_t hidden_dim, + WeightType 
weight_type, + int group_size, + size_t tp, + bool fuse_silu_act) { - // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (param.expert_num.size() <= layer_id) { + return; + } + + const int expert_num = param.expert_num[layer_id]; if (expert_num == 0) { return; } + // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + gate.input_dims = hidden_dim; gate.output_dims = expert_num; gate.type = get_default_weight_type(); @@ -210,15 +299,15 @@ struct MoeFfnWeight { experts.resize(expert_num); - this->method = method; - fuse_silu_act = fuse_silu_act && method; + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; for (auto& e : experts) { // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; } - if (has_shared_gate) { + if (param.shared_gate) { shared_gate.input_dims = hidden_dim; shared_gate.output_dims = 1; shared_gate.type = get_default_weight_type(); @@ -229,14 +318,36 @@ struct MoeFfnWeight { } } + void malloc(cudaStream_t st) + { + gate.malloc(st); + if (shared_gate.output_dims) { + shared_gate.malloc(st); + } + for (auto& e : experts) { + e.malloc(st); + } + } + + void free(cudaStream_t st) + { + gate.free(st); + shared_gate.free(st); + for (auto& e : experts) { + e.free(st); + } + block.free(st); + } + LlamaDenseWeight gate; std::vector> experts; LlamaDenseWeight shared_gate; + // reference into `experts` LlamaFfnWeight block; - int method{}; + MoeParam::Method method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 8cce207203..907467341a 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -27,21 +27,20 @@ namespace turbomind { template -void LlamaFfnLayer::allocateBuffer(size_t token_num, - int inter_size, - const LlamaDenseWeight* gating, - const LlamaDenseWeight* inter) +void LlamaFfnLayer::allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { const size_t sz = token_num * inter_size; - const size_t sz_gate = token_num * gating->lora.r; - const size_t sz_inter = token_num * inter->lora.r; + const size_t sz_gate = token_num * gating_lora_r; + const size_t sz_inter = token_num * inter_lora_r; - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); - inter_buf_ = gating_buf_ + sz; + gating_buf_ = + (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * inter_buf_factor + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; // gate & inter is not fused when lora is enabled - if (gating->lora.r) { + if (gating_lora_r) { inter_buf_ += sz_gate; } @@ -93,12 +92,16 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const int layer_id = input_tensors->getVal("layer_id"); const int inter_size = weights->inter_size; - allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); + const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + + allocateBuffer(token_num, inter_size, is_fused_silu ? 
1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const bool all_reduce = input_tensors->getVal("all_reduce", false); + if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); @@ -145,7 +148,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - if (all_reduce_ && tensor_para_.world_size_ > 1) { + if (all_reduce && tensor_para_.world_size_ > 1) { + // std::cout << "ffn all reduce " << layer_id << "\n"; NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 2daca2cc95..a72a24701e 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -30,13 +30,12 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), - all_reduce_(all_reduce) + allocator_(ctx.allocator.get()) { } @@ -48,7 +47,8 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); void freeBuffer(); @@ -59,7 +59,6 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; - const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 3d50910ad4..05b22deed5 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -72,7 +72,6 @@ LlamaV2::LlamaV2(const ModelParam& model, lora_param_(lora), head_num_(model.head_num), size_per_head_(model.head_dim), - inter_size_(model.inter_size), hidden_units_(model.hidden_units), layer_num_(model.layer_num), vocab_size_(model.vocab_size), diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7c..658282f5e5 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -113,7 +113,6 @@ class LlamaV2 { const size_t head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t inter_size_; const size_t layer_num_; const size_t vocab_size_; const size_t vocab_size_padded_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 9d62042d62..bcee150977 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -20,36 +20,24 @@ #include "src/turbomind/models/llama/LlamaWeight.h" #include 
"src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { template -LlamaWeight::LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size), - embedding_size_(embedding_size), - num_layer_(num_layer), - weight_type_(weight_type), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaWeight::LlamaWeight( + const ModelParam& model, const LoraParam& lora_param, const MoeParam& moe_param, size_t tp_size, size_t tp_rank): + hidden_units_(model.hidden_units), + inter_size_(model.inter_size), + vocab_size_(model.vocab_size), + vocab_size_padded_(model.vocab_size), + embedding_size_(model.embedding_size), + num_layer_(model.layer_num), + weight_type_(model.weight_type), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (vocab_size_padded_ % tensor_para_size_ != 0) { vocab_size_padded_ = (vocab_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; @@ -61,49 +49,42 @@ LlamaWeight::LlamaWeight(size_t head_num, } FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + decoder_layer_weights.reserve(num_layer_); for (unsigned l = 0; l < num_layer_; ++l) { - decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(l, - head_num, - kv_head_num, - size_per_head, - hidden_units_, - inter_size_, - weight_type_, - group_size, - lora_param, - attn_bias, - moe_param, - tensor_para_size_, - tensor_para_rank_)); + decoder_layer_weights.emplace_back( + new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + decoder_layer_weights.back()->malloc(stream_); } - mallocWeights(); + FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); + deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_, stream_); + deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); + deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_, stream_); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template LlamaWeight::~LlamaWeight() { - cudaFree((void*)pre_decoder_embedding_table); - cudaFree((void*)output_norm_weight); - cudaFree((void*)post_decoder_embedding_kernel); - - pre_decoder_embedding_table = nullptr; - output_norm_weight = nullptr; - post_decoder_embedding_kernel = nullptr; + deviceFree(pre_decoder_embedding_table, stream_); + deviceFree(output_norm_weight, stream_); + deviceFree(post_decoder_embedding_kernel, stream_); for (auto& p : decoder_layer_weights) { + p->free(stream_); delete p; } -} -template -void LlamaWeight::mallocWeights() -{ - FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_); - deviceMalloc((T**)&output_norm_weight, hidden_units_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); + decoder_layer_weights.clear(); + + // Wait for deallocations + 
check_cuda_error(cudaStreamSynchronize(stream_)); + check_cuda_error(cudaStreamDestroy(stream_)); + stream_ = {}; } template @@ -179,13 +160,19 @@ void LlamaWeight::prepare(const cudaDeviceProp& prop) TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + // Wait for the weights to be filled externally + check_cuda_error(cudaDeviceSynchronize()); + if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size); + deviceMalloc((char**)&workspace, workspace_size, stream_); } for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop); + layer->prepare(workspace, workspace_size, prop, stream_); } - deviceFree(workspace); + + deviceFree(workspace, stream_); + + check_cuda_error(cudaStreamSynchronize(stream_)); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c30e753565..629cd56120 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -22,28 +22,18 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaWeight(const ModelParam& model_param, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); ~LlamaWeight(); @@ -57,15 +47,13 @@ struct LlamaWeight { void prepare(const cudaDeviceProp& prop); std::vector*> decoder_layer_weights; - const T* pre_decoder_embedding_table{}; - const T* output_norm_weight{}; - const T* post_decoder_embedding_kernel{}; -private: - void mallocWeights(); + T* pre_decoder_embedding_table{}; + T* output_norm_weight{}; + T* post_decoder_embedding_kernel{}; +private: size_t hidden_units_; - size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; size_t embedding_size_; @@ -73,6 +61,10 @@ struct LlamaWeight { WeightType weight_type_; size_t tensor_para_size_; size_t tensor_para_rank_; + + std::vector inter_size_; + + cudaStream_t stream_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc index 62952cd715..f9a0191e4b 100644 --- a/src/turbomind/models/llama/llama_gemm.cc +++ b/src/turbomind/models/llama/llama_gemm.cc @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) return -1; } else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, false); + ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, nullptr, false); } if (0) {} diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index 3b01dee60d..aaade1a513 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -154,7 +154,7 @@ template struct TempBuffer { TempBuffer(size_t size) { - deviceMalloc(&data, size, false); + cudaMalloc(&data, size); } T* data; }; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index e6b9d690ae..0a505b11a9 100644 --- a/src/turbomind/models/llama/llama_params.h +++ 
b/src/turbomind/models/llama/llama_params.h @@ -2,28 +2,41 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include #include #include #include +#include "src/turbomind/models/llama/weight_type.h" + namespace turbomind { +struct MLAParam { + size_t q_lora_rank; + size_t kv_lora_rank; + size_t qk_rope_dim; + size_t v_head_dim; +}; + struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t inter_size; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - // - int start_id; - int end_id; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + size_t embedding_size; + float norm_eps; + int quant_policy; + bool attn_bias; + WeightType weight_type; + int group_size; + int start_id; + int end_id; + MLAParam mla; + int tune_layer_num; + + std::vector inter_size; }; struct MoeParam { @@ -32,17 +45,25 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; - bool norm_topk; - bool shared_gate; + + int experts_per_token; + int inter_size; + bool norm_topk_prob; + bool shared_gate; + float routed_scale; + + int topk_group; + std::string topk_method; + int n_group; + + std::vector expert_num; }; struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; + float softmax_scale; std::string rope_scaling_type; int original_max_position_embeddings; float rope_scaling_factor; @@ -74,6 +95,12 @@ struct EngineParam { int max_prefill_iters; }; +enum class LoraPolicy : int +{ + kNull, + kPlora, +}; + struct LoraParam { int r; float scale; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 925c6b8831..eaa450ae20 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -1,47 +1,25 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_utils.h" #include #include #include #include +#include +#include + #include #include #include #include #include -#include + +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { CmpMode compare_mode = kCmpRead; - -template -struct abs_diff_t { - using type = T; -}; - -template<> -struct abs_diff_t { - using type = float; -}; - -template<> -struct abs_diff_t<__nv_bfloat16> { - using type = float; -}; - -template -struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> { - __host__ __device__ float operator()(thrust::tuple x) const - { - using R = typename abs_diff_t::type; - auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x)); - return r < R(0) ? 
-r : r; - } -}; +// CmpMode compare_mode = kCmpWrite; template void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) @@ -63,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) template void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) { - // wait for b - check_cuda_error(cudaStreamSynchronize(stream)); // read a from file - thrust::host_vector h_a(size); + std::vector h_a(size); { const auto filename = "tmp/" + key + ".cmp"; std::ifstream ifs(filename, std::ios::binary); @@ -85,15 +61,30 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) } ifs.read((char*)h_a.data(), sizeof(T) * h_a.size()); } - // copy a to device - thrust::device_vector a = h_a; - // create abs(a - b) iterator - thrust::device_ptr dev_ptr(ptr); - auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr)); - auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{}); - // sum(abs(a - b)) - auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); - std::cerr << key << ": " << asum << " " << asum / size << "\n"; + std::vector h_b(size); + check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); + + using Tacc = std::conditional_t, int64_t, float>; + constexpr Tacc eps = std::is_integral_v ? 1 : 1e-8f; + + Tacc asum{}; + Tacc rsum{}; + Tacc amean{}; + for (size_t i = 0; i < size; ++i) { + Tacc x = (Tacc)h_b[i]; + Tacc r = (Tacc)h_a[i]; + Tacc abs_diff = std::abs(x - r); + Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); + asum += abs_diff; + rsum += rel_diff; + amean += std::abs(r); + } + + std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; + + check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); } template diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu new file mode 100644 index 0000000000..2f9e786f2a --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.cu @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
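Note: the reworked CmpRead above drops the thrust-based reduction in favor of a host-side pass that reports the mean reference magnitude, the total and mean absolute difference, and the mean relative difference, then copies the reference data back into the device pointer. The standalone sketch below is only meant to document that error summary; the accumulator type, the eps guard and the printed order follow the patch, while the function name summarize_diff and the std::vector interface are ours and not part of the code base.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <type_traits>
#include <vector>

template<class T>
void summarize_diff(const std::vector<T>& ref, const std::vector<T>& out)
{
    using Acc         = std::conditional_t<std::is_integral_v<T>, int64_t, float>;
    constexpr Acc eps = std::is_integral_v<T> ? 1 : Acc(1e-8);

    Acc asum{}, rsum{}, amean{};
    for (size_t i = 0; i < ref.size(); ++i) {
        const Acc r        = (Acc)ref[i];
        const Acc x        = (Acc)out[i];
        const Acc abs_diff = std::abs(x - r);
        const Acc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps);
        asum += abs_diff;
        rsum += rel_diff;
        amean += std::abs(r);
    }

    const double n = (double)ref.size();
    // mean |ref|, sum |diff|, mean |diff|, mean relative diff (same order as CmpRead)
    std::printf("%g %g %g %g\n", amean / n, (double)asum, asum / n, rsum / n);
}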
+#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +__global__ void mla_copy_qkv_kernel(T* qkv, + const T* q, // [h, head_dim] + const T* kv_a, // [kv_lora_rank, rope_dim] + const T* kv_b, // [h, nope_dim + v_head_dim] + int head_num, + int head_dim, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim) +{ + const int type = blockIdx.y; + + const int64_t ti = blockIdx.x; + const int di = threadIdx.x; + + const int kv_b_dim = nope_dim + v_head_dim; + + // for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + const int hi = threadIdx.y; + Array data{}; + if (type == 0) { // Q + if (di * vec_size < rope_dim) { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + nope_dim + di * vec_size]); + } + else { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size - rope_dim]); + } + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); + } + } + const int stride = 3 * head_num * head_dim; + Store(&qkv[ti * stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + // } +} + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + constexpr int vec_size = 16 / sizeof(T); + const int head_dim = nope_dim + rope_dim; + + dim3 block(head_dim / vec_size, head_num); + // make sure block size <= 1024 + while (block.x * block.y > 1024) { + block.y /= 2; + } + const dim3 grid(token_num, 3); + + mla_copy_qkv_kernel<<>>( + qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); +} + +template void invokeMLACopyQKV(uint16_t* qkv, + const uint16_t* q, + const uint16_t* kv_a, + const uint16_t* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h new file mode 100644 index 0000000000..bc06a352f9 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + +#include +#include + +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +template +void dispatchMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto x) { + using type = decltype(x); + invokeMLACopyQKV((type*)qkv, + (const type*)q, + (const type*)kv_a, + (const type*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + if constexpr (sizeof(T) == 2) { + return invoke(uint16_t{}); + } + FT_CHECK(0); +} + +} // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 1ad76839d1..390d147540 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -11,22 +11,21 @@ #include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/string_utils.h" #include -#include #include namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) { char* base = 0; auto allocate = [&](void* base) { Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); - alloc(&logits_, tokens * param_.expert_num); - alloc(&masks_, param_.expert_num * padded); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); + alloc(&logits_, tokens * expert_num); + alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); @@ -80,18 +79,42 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama template void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { - const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const int expert_num = moe.experts.size(); - AllocateBuffer(tokens, padded); + FT_CHECK(expert_num); + + const size_t inter_buf_factor = [&] { + if (param_.method == MoeParam::kNaive) { + return 0; // managed by ffn + } + else if (moe.block.is_fused_silu) { + return 1; + } + else { + return 2; + } + }(); + + AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); - sync_check_cuda_error(); + // if (tensor_para_.rank_ == 0) { + // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); + // } + + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); + if (param_.topk_method == "group_limited_greedy") { + invokeMaskMoeTopKGroups(logits_, 
tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + sync_check_cuda_error(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_, en2f_, @@ -102,25 +125,26 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id logits_, tokens, padded, - param_.expert_num, + expert_num, param_.experts_per_token, - param_.norm_topk, + param_.norm_topk_prob, + param_.routed_scale, stream_); sync_check_cuda_error(); if (isTuning()) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); - std::vector cnt(param_.expert_num); + const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; } h_offsets_[0] = 0; - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } if (param_.method == MoeParam::kNaive) { @@ -129,15 +153,15 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id sync_check_cuda_error(); check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); } - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { FT_CHECK(moe.experts[i].is_fused_silu == false); @@ -153,7 +177,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } } else { - context_->set_offsets(offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_); auto& block = moe.block; @@ -217,7 +241,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -227,19 +251,21 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) tokens, param_.experts_per_token, hidden_dim_, + output_scale, stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { + // std::cout << "moe all reduce " << layer_id << "\n"; ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } } template -void MoeFfnLayer::dump_logits(int token_num, int layer_id) +void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) { - std::vector logits(token_num * param_.expert_num); + std::vector logits(token_num * expert_num); check_cuda_error( cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -247,7 
+273,7 @@ void MoeFfnLayer::dump_logits(int token_num, int layer_id) auto ptr = logits.data(); std::cout << "layer_id: " << layer_id << std::endl; for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < param_.expert_num; ++e) { + for (int e = 0; e < expert_num; ++e) { std::cout << *ptr++ << " "; } std::cout << std::endl; diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 0f1713f7b5..74c62d004b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -9,6 +9,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -26,23 +27,24 @@ class MoeFfnLayer { linear_(ctx.linear.get()), allocator_(ctx.allocator.get()) { - model.inter_size = param.inter_size; + FT_CHECK(!param.expert_num.empty()); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); if (param_.method == MoeParam::kFused) { context_ = std::make_unique( - param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { - expert_ffn_ = std::make_unique>(model, tp, ctx, false); + expert_ffn_ = std::make_unique>(model, tp, ctx); } - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); void FreeBuffer(); @@ -53,11 +55,11 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - void dump_logits(int token_num, int layer_id); + void dump_logits(int token_num, int layer_id, int expert_num); private: const size_t inter_size_; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 2f99b0c2ce..7a6eddc4ba 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -19,21 +19,24 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc -#include "src/turbomind/models/llama/unified_attention_layer.h" +#include +#include + #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include 
"src/turbomind/models/llama/mla_utils.h" +#include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" -#include -#include +#include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -72,17 +75,14 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, } template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, - size_t k_count, - size_t batch_size, - const WeightType* weights) +void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - if (weights->qkv.lora.r) { - size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + weights->qkv.lora.r); + if (qkv_lora_rank) { + size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + qkv_lora_rank); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sz, false); } else { @@ -198,28 +198,38 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa allocateBuffer(token_num, // shared h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill batch_size, - weights); + weights->qkv.lora.r); // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - static int count = 0; + // static int count = 0; - // if (layer_id == 0 && count == 0) { - // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); + // if (tensor_para_.rank_ == 0) { + // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); - sync_check_cuda_error(); + + if (weights->qkv.output_dims) { + ////////////////////////////////////////////// + /// qkv gemm + // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); + } + else { + forward_mla(attention_input, token_num, *weights); + } + + // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); + // std::cerr << "token num: " << token_num << "\n"; + + // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { + // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); // } if constexpr (0) { @@ -290,8 +300,15 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; params.size_per_head = size_per_head_; + // MSVC does not have M_LOG2E - params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); + params.inv_sqrt_dh = 
(float)std::log2(expf(1.)); + if (param_.softmax_scale) { // model predefined softmax scale + params.inv_sqrt_dh *= param_.softmax_scale; + } + else { // default value + params.inv_sqrt_dh /= std::sqrt((float)params.size_per_head); + } params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; @@ -324,8 +341,9 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa }; float low, high; find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 if (low == high) { - high += 0.01f; + high += 0.001f; } params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; @@ -415,8 +433,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); - // ++count; - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); if (tensor_para_.world_size_ > 1) { @@ -425,10 +441,94 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } + // if (tensor_para_.rank_ == 0) { + // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); + // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); + // } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } sync_check_cuda_error(); + + // ++count; +} + +template +void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +{ + const int q_lora_rank = w.q_a_proj.output_dims; + const int kv_lora_rank = w.kv_b_proj.input_dims; + const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + + T* q{}; + + if (w.q_proj.kernel) { + deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); + linear_->forward(q, inputs, token_num, w.q_proj); + sync_check_cuda_error(); + } + else { + T* q_a{}; + deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); + + linear_->forward(q_a, inputs, token_num, w.q_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm(q_a, + q_lora_rank, + q_a, + q_lora_rank, + w.q_a_layernorm, + q_lora_rank, + token_num, + model_param_.norm_eps, + stream_); + sync_check_cuda_error(); + + deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); + linear_->forward(q, q_a, token_num, w.q_b_proj); + sync_check_cuda_error(); + + deviceFree(q_a, stream_); + } + + T* kv_a{}; + const int kv_a_dim = w.kv_a_proj.output_dims; + deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); + + linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm( + kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + sync_check_cuda_error(); + + T* kv_b{}; + deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + sync_check_cuda_error(); + + linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); + sync_check_cuda_error(); + + dispatchMLACopyQKV(qkv_buf_, + 
q, + kv_a, + kv_b, + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); + sync_check_cuda_error(); + + deviceFree(q, stream_); + deviceFree(kv_a, stream_); + deviceFree(kv_b, stream_); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index da0c0e6fc8..7d331b0e41 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -42,7 +42,7 @@ class UnifiedAttentionLayer { static constexpr int kMaxWorkspaceTokens = 4096; void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, const WeightType* weights); + void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); void allocateWorkspace(); void freeWorkspace(); @@ -70,7 +70,7 @@ class UnifiedAttentionLayer { const NcclParam& tp, const Context& context); - void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); + void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); void prefill(T* output, T* tmp_kv_buffer, @@ -107,6 +107,9 @@ class UnifiedAttentionLayer { int max_split_k, const WeightType* weights); +private: + void forward_mla(const T* inputs, int token_num, const WeightType& weights); + private: const size_t head_num_; const size_t kv_head_num_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 28e8b5f649..ec0e75b7e5 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,13 +1,17 @@ -#include "src/turbomind/models/llama/unified_decoder.h" + +#include + +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/models/llama/unified_decoder.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include namespace turbomind { @@ -23,17 +27,19 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), - dtype_(getTensorType()) + tp_(tp), + dtype_(getTensorType()), + tune_layer_num_(model.tune_layer_num) { attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - if (moe.expert_num) { + if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); } - if (model.inter_size) { - ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { + ffn_layer_ = std::make_unique>(model, tp, ctx); } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); @@ -65,13 +71,13 @@ void UnifiedDecoder::freeBuffer() } template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight) +void UnifiedDecoder::forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + 
int layer_id, + const WeightType* weight) { TensorMap inputs(*_inputs); inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); @@ -84,7 +90,7 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap outputs(*_outputs); outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - attn_layer_->forward(&outputs, &inputs, weight); + attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } template @@ -141,19 +147,15 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; - // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); - - // printf("%d %f\n", (int)token_num, rmsnorm_eps_); - ///////////////////////////////////////////// /// RMSNorm - invokeRootMeanSquareNorm(decoder_output, - decoder_input_output, - weights->at(0)->self_attn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeRMSNorm(decoder_output, + decoder_input_output, + weights->at(0)->self_attn_norm_weights, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); @@ -161,12 +163,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer != 0) { + if (isTuning() && layer >= tune_layer_num_) { continue; } - // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); - ///////////////////////////////////////////// /// self-attention forwardSelfAttn(decoder_output, // @@ -175,18 +175,18 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con token_num, batch_size, layer, - &weights->at(layer)->self_attn_weights); + weights->at(layer)); count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - invokeFusedAddBiasResidualRMSNorm(decoder_input_output, - decoder_output, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeBiasResidualRMSNorm(decoder_input_output, + decoder_output, + weights->at(layer)->ffn_norm_weights, + weights->at(layer)->self_attn_weights.output.bias, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); @@ -195,14 +195,17 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - if (!weights->at(layer)->moe_weights.experts.empty()) { + const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); + if (is_moe) { moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (ffn_layer_) { - int layer_id = layer; // int is needed + if (weights->at(layer)->ffn_weights.output.kernel) { + int layer_id = layer; // int is needed + bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, + {"all_reduce", {MEMORY_CPU, TYPE_BOOL, {1}, &all_reduce}}}; TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, 
dtype_, {token_num, hidden_units_}, decoder_output}}}; if (inputs->isExist("lora_mask")) { ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); @@ -210,8 +213,8 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } - if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + if (is_moe) { + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index f13b4ba842..e08567136d 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -22,7 +22,9 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; IAllocator* const allocator_; + const NcclParam tp_; const DataType dtype_; + const int tune_layer_num_; bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; @@ -39,13 +41,13 @@ class UnifiedDecoder { using WeightType = LlamaDecoderLayerWeight; - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight); + void forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight); public: UnifiedDecoder(const ModelParam& model, diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h new file mode 100644 index 0000000000..bc2f49a08e --- /dev/null +++ b/src/turbomind/models/llama/weight_type.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +enum class WeightType : int +{ + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + +inline size_t getBitSize(WeightType type) +{ + switch (type) { + case WeightType::kFP32: + return 32; + case WeightType::kFP16: + return 16; + case WeightType::kFP8: + return 8; + case WeightType::kBF16: + return 16; + case WeightType::kINT8: + return 8; + case WeightType::kINT4: + return 4; + } + return 0; +} + +} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249ff..5a344d9545 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj) return dlmt->dl_tensor; } +static void safe_memcpy(void* dst, const void* src, size_t size) +{ + cudaPointerAttributes dat{}; + cudaPointerAttributes sat{}; + ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst)); + ft::check_cuda_error(cudaPointerGetAttributes(&sat, src)); + try { + if (dat.devicePointer && sat.devicePointer) { + // Both can be accessed from current context + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + else if (dat.type == cudaMemoryTypeDevice && sat.type == 
cudaMemoryTypeDevice) { + if (dat.device != sat.device) { + // On different devices, try peer memcpy + ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size)); + } + else { + // Same device, switch to the device first (this is unlikely) + ft::CudaDeviceGuard guard(dat.device); + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + else { + // Unknown case, give it a try anyway + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + catch (...) { + int device_id{-1}; + cudaGetDevice(&device_id); + TM_LOG_ERROR("cudaMemcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d", + (int)dat.type, + dat.device, + dat.devicePointer, + dat.hostPointer, + (int)sat.type, + sat.device, + sat.devicePointer, + sat.hostPointer, + std::to_string(size).c_str(), + device_id); + throw; + } +} + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -293,8 +338,7 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + safe_memcpy(const_cast(self->data), src->data, num_bytes); break; } default: diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2deca46380..1c7c5eb468 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -256,22 +256,30 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.kv_head_num = model_reader["kv_head_num"].as(0); model_param_.hidden_units = model_reader["hidden_units"].as(); model_param_.layer_num = model_reader["num_layer"].as(); - model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); model_param_.embedding_size = model_reader["embedding_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); + model_param_.tune_layer_num = model_reader["tune_layer_num"].as(1); + model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as(); + model_param_.mla.kv_lora_rank = model_reader["kv_lora_rank"].as(); + model_param_.mla.qk_rope_dim = model_reader["qk_rope_dim"].as(); + model_param_.mla.v_head_dim = model_reader["v_head_dim"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); - + YAML::Node inter_size = model_reader["inter_size"]; + for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { + model_param_.inter_size.push_back(it->as()); + } // Only weight classes need these - attn_bias_ = model_reader["attn_bias"].as(0); - group_size_ = model_reader["group_size"].as(0); + model_param_.attn_bias = model_reader["attn_bias"].as(0); + model_param_.group_size = model_reader["group_size"].as(0); // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.softmax_scale = attention_reader["softmax_scale"].as(0); attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); attn_param_.beta_fast = 
attention_reader["beta_fast"].as(32.f); attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); @@ -297,19 +305,27 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); - moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); - moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(); + moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(); + moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); + moe_param_.topk_group = model_reader["topk_group"].as(1); + moe_param_.topk_method = model_reader["topk_method"].as("greedy"); + moe_param_.n_group = model_reader["moe_group_num"].as(1); + YAML::Node expert_num = model_reader["expert_num"]; + for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { + moe_param_.expert_num.push_back(it->as()); + } handleMissingParams(); @@ -321,19 +337,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + model_param_.weight_type = ft::WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + model_param_.weight_type = ft::WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + model_param_.weight_type = ft::WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + model_param_.weight_type = ft::WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + model_param_.weight_type = ft::WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -418,21 +434,8 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - 
model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.embedding_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>( + model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -488,9 +491,11 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " // << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num - << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nsize_per_head: " + << model_param_.head_dim + // << "\ninter_size: " << model_param_.inter_size << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter @@ -501,8 +506,9 @@ std::string LlamaTritonModel::toString() << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy << "\ngroup_size: " + << model_param_.group_size + // << "\nexpert_num: " << moe_param_.expert_num << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e721..a6c1b862ac 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,9 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::EngineParam engine_param_; size_t tensor_para_size_; size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; std::shared_ptr shared_state_; // Weights & engine instances for the ranks diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index bdcb9bfc46..88c299c3de 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,8 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address); + FT_CHECK_WITH_INFO(0, + fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 2148fcc164..8311e6eb9e 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,24 @@ void compareTwoTensor( bool 
is_16xx_series(const char* name); +class CudaDeviceGuard { +public: + CudaDeviceGuard(int device) + { + cudaGetDevice(&last_device_id_); + if (device != last_device_id_) { + cudaSetDevice(device); + } + } + + ~CudaDeviceGuard() + { + cudaSetDevice(last_device_id_); + } + +private: + int last_device_id_{-1}; +}; + /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index f8bfb8efe0..e9a79ea5a1 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -26,77 +26,71 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) { - FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); if (is_random_initialize) { - cudaRandomUniform(*ptr, size); + cudaRandomUniform(*ptr, size, st); } } -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) +void deviceFree(T*& ptr, cudaStream_t 
st) { if (ptr != NULL) { - check_cuda_error(cudaFree(ptr)); + check_cuda_error(cudaFreeAsync(ptr, st)); ptr = NULL; } } -template void deviceFree(float*& ptr); -template void deviceFree(half*& ptr); +template void deviceFree(float*& ptr, cudaStream_t); +template void deviceFree(half*& ptr, cudaStream_t); #ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr); +template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t); #endif -template void deviceFree(unsigned short*& ptr); -template void deviceFree(int*& ptr); -template void deviceFree(bool*& ptr); -template void deviceFree(char*& ptr); -template void deviceFree(int8_t*& ptr); +template void deviceFree(unsigned short*& ptr, cudaStream_t); +template void deviceFree(int*& ptr, cudaStream_t); +template void deviceFree(bool*& ptr, cudaStream_t); +template void deviceFree(char*& ptr, cudaStream_t); +template void deviceFree(int8_t*& ptr, cudaStream_t); +template void deviceFree(void*& ptr, cudaStream_t); #ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr); +template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t); #endif +namespace { + +template +__global__ void fill_kernel(T* devptr, size_t size, T value) +{ + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) { + devptr[i] = value; + } +} + +} // namespace + template void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) { - T* arr = new T[size]; - std::fill(arr, arr + size, value); - check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream)); - delete[] arr; + constexpr int threads = 512; + const int blocks = (size + threads - 1) / threads; + fill_kernel<<>>(devptr, size, value); } template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); @@ -280,23 +274,23 @@ __global__ void cuda_random_uniform_kernel(char* buffer, const size_t size } template -void cudaRandomUniform(T* buffer, const size_t size) +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st) { static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset); + cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset); seq_offset += 256 * 256; } -template void cudaRandomUniform(float* buffer, const size_t size); -template void cudaRandomUniform(half* buffer, const size_t size); +template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size); +template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t); #endif -template void cudaRandomUniform(int* buffer, const size_t size); -template void cudaRandomUniform(bool* buffer, const size_t size); -template void cudaRandomUniform(char* buffer, const size_t size); +template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size); +template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t); #endif // loads data from binary file. If it succeeds, returns a non-empty vector. 
If loading fails or
@@ -366,10 +360,10 @@ int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filenam
     }
     else {
         T_IN* ptr_2 = nullptr;
-        deviceMalloc(&ptr_2, host_array.size(), false);
+        deviceMalloc(&ptr_2, host_array.size(), nullptr, false);
         cudaH2Dcpy(ptr_2, host_array.data(), host_array.size());
         invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size());
-        deviceFree(ptr_2);
+        deviceFree(ptr_2, nullptr);
     }
     return 0;
 }
diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h
index bb7a4f9c03..03a0ef7b33 100644
--- a/src/turbomind/utils/memory_utils.h
+++ b/src/turbomind/utils/memory_utils.h
@@ -23,16 +23,13 @@ namespace turbomind {
 
 template
-void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true);
+void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false);
 
 template
-void deviceMemSetZero(T* ptr, size_t size);
+void deviceFree(T*& ptr, cudaStream_t st);
 
 template
-void deviceFree(T*& ptr);
-
-template
-void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0);
+void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {});
 
 template
 void cudaD2Hcpy(T* tgt, const T* src, const size_t size);
@@ -44,10 +41,10 @@ template
 void cudaD2Dcpy(T* tgt, const T* src, const size_t size);
 
 template
-void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL);
+void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {});
 
 template
-void cudaRandomUniform(T* buffer, const size_t size);
+void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {});
 
 template
 int loadWeightFromBin(T* ptr,

From 01f82e09c11b6866b8ebe862de2595ebe87e9733 Mon Sep 17 00:00:00 2001
From: zhabuye <74179177+zhabuye@users.noreply.github.com>
Date: Fri, 29 Nov 2024 16:37:29 +0800
Subject: [PATCH 25/40] Add Ascend installation adapter (#2817)

---
 requirements/runtime_ascend.txt | 22 ++++++++++++++++++++++
 requirements_ascend.txt         |  4 ++++
 setup.py                        | 22 ++++++++++++++++++----
 3 files changed, 44 insertions(+), 4 deletions(-)
 create mode 100644 requirements/runtime_ascend.txt
 create mode 100644 requirements_ascend.txt

diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt
new file mode 100644
index 0000000000..d87748e396
--- /dev/null
+++ b/requirements/runtime_ascend.txt
@@ -0,0 +1,22 @@
+accelerate>=0.29.3
+dlinfer-ascend
+einops
+fastapi
+fire
+mmengine-lite
+numpy<2.0.0
+openai
+outlines<0.1.0
+peft<=0.11.1
+pillow
+protobuf
+pydantic>2.0.0
+pynvml
+safetensors
+sentencepiece
+shortuuid
+tiktoken
+torch<=2.4.0,>=2.0.0
+torchvision<=0.19.0,>=0.15.0
+transformers
+uvicorn
diff --git a/requirements_ascend.txt b/requirements_ascend.txt
new file mode 100644
index 0000000000..e844853ab4
--- /dev/null
+++ b/requirements_ascend.txt
@@ -0,0 +1,4 @@
+-r requirements/build.txt
+-r requirements/runtime_ascend.txt
+-r requirements/lite.txt
+-r requirements/serve.txt
diff --git a/setup.py b/setup.py
index 32a69c600c..7a08ac7919 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,14 @@
 
 from setuptools import find_packages, setup
 
+npu_available = False
+try:
+    import torch_npu
+
+    npu_available = torch_npu.npu.is_available()
+except ImportError:
+    pass
+
 pwd = os.path.dirname(__file__)
 version_file = 'lmdeploy/version.py'
 
@@ -145,11 +153,17 @@ def gen_packages_items():
         include_package_data=True,
         setup_requires=parse_requirements('requirements/build.txt'),
         tests_require=parse_requirements('requirements/test.txt'),
-
install_requires=parse_requirements('requirements/runtime.txt'), + install_requires=parse_requirements( + 'requirements/runtime_ascend.txt' + if npu_available else 'requirements/runtime.txt'), extras_require={ - 'all': parse_requirements('requirements.txt'), - 'lite': parse_requirements('requirements/lite.txt'), - 'serve': parse_requirements('requirements/serve.txt') + 'all': + parse_requirements('requirements_ascend.txt' + if npu_available else 'requirements.txt'), + 'lite': + parse_requirements('requirements/lite.txt'), + 'serve': + parse_requirements('requirements/serve.txt') }, has_ext_modules=check_ext_modules, classifiers=[ From 0b6dd1f23aa9b2239fc6d9c24314ee25bec3990c Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Fri, 29 Nov 2024 17:54:46 +0800 Subject: [PATCH 26/40] [CI] add more testcase for mllm models (#2791) * update * update * update * update * update * update * update * update * update --- autotest/config-v100.yaml | 16 +- autotest/config.yaml | 20 +- .../test_pipeline_chat_pytorch_llm.py | 2 - .../test_pipeline_chat_pytorch_mllm.py | 4 - .../test_pipeline_chat_turbomind_llm.py | 2 - .../test_pipeline_chat_turbomind_mllm.py | 4 - .../test_restful_chat_hf_pytorch_llm.py | 3 +- .../test_restful_chat_hf_pytorch_mllm.py | 3 +- .../test_restful_chat_hf_turbomind_llm.py | 3 +- .../test_restful_chat_hf_turbomind_mllm.py | 3 +- autotest/utils/pipeline_chat.py | 348 ++++++++++++++++++ autotest/utils/run_restful_chat.py | 15 +- docs/en/supported_models/supported_models.md | 4 +- .../supported_models/supported_models.md | 4 +- 14 files changed, 401 insertions(+), 30 deletions(-) diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml index 41216cb730..507f81ceb6 100644 --- a/autotest/config-v100.yaml +++ b/autotest/config-v100.yaml @@ -1,4 +1,5 @@ model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports @@ -100,12 +101,22 @@ turbomind_quatization: - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm-xcomposer2d5-7b - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - THUDM/glm-4-9b-chat + - deepseek-ai/deepseek-coder-1.3b-instruct + - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat no_kvint4: - openbmb/MiniCPM-V-2_6 + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-0.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat @@ -120,6 +131,10 @@ pytorch_quatization: no_kvint4: - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct @@ -128,7 +143,6 @@ pytorch_quatization: no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat - longtext_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct diff --git a/autotest/config.yaml b/autotest/config.yaml index 88ca7c3127..b4fd4e1712 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -1,4 +1,5 @@ model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: 
/nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports @@ -18,6 +19,7 @@ tp_config: Qwen2-7B-Instruct-GPTQ-Int4: 2 InternVL2-40B: 2 MiniCPM-V-2_6: 2 + Qwen2.5-72B-Instruct: 4 turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct @@ -164,7 +166,11 @@ pytorch_base_model: turbomind_quatization: no_awq: + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mistral-Nemo-Instruct-2407 - deepseek-ai/deepseek-coder-1.3b-instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf @@ -172,6 +178,12 @@ turbomind_quatization: - internlm/internlm2_5-7b-chat no_kvint4: - openbmb/MiniCPM-V-2_6 + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-0.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat @@ -203,6 +215,10 @@ pytorch_quatization: no_kvint4: - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct @@ -211,7 +227,6 @@ pytorch_quatization: no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat - longtext_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct @@ -227,7 +242,8 @@ benchmark_model: - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - THUDM/glm-4-9b-chat - - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-72B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index a828e17a09..58674fa173 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -67,8 +67,6 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 276ced5bcb..8403ced94f 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -50,8 +50,6 @@ def test_pipeline_chat_tp2(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') @@ -70,8 +68,6 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) diff --git 
a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 17560e754d..d1865175cf 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -56,8 +56,6 @@ def test_pipeline_chat_tp2(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_all_model_list(tp_num=1, quant_policy=4)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 8f1bc7d8b1..8c845fa77a 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -50,8 +50,6 @@ def test_pipeline_chat_tp2(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') @@ -70,8 +68,6 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index ab1f5595ae..fc95e288ca 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -67,8 +67,7 @@ def getKvintModelList(tp_num, quant_policy): 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' } for item in get_torch_model_list( - tp_num, quant_policy=quant_policy, exclude_dup=True) - if 'qwen2' not in item.lower() or quant_policy == 8] + tp_num, quant_policy=quant_policy, exclude_dup=True)] @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index b210733db4..bf20c45e6e 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -60,8 +60,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' } for item in get_torch_model_list( - tp_num, quant_policy=quant_policy, model_type='vl_model') - if 'qwen2' not in item.lower() or quant_policy == 8] + tp_num, quant_policy=quant_policy, model_type='vl_model')] @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 91e65ee51a..1c9131b32e 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -66,8 +66,7 @@ def getKvintModelList(tp_num, quant_policy): 'cuda_prefix': None, 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' - } for item in 
get_all_model_list(tp_num, quant_policy=quant_policy) - if 'qwen2' not in item.lower() or quant_policy == 8] + } for item in get_all_model_list(tp_num, quant_policy=quant_policy)] @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 091e18e6e3..641f2f760f 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -60,8 +60,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' } for item in get_all_model_list( - tp_num, quant_policy=quant_policy, model_type='vl_model') - if 'qwen2' not in item.lower() or quant_policy == 8] + tp_num, quant_policy=quant_policy, model_type='vl_model')] @pytest.mark.order(7) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 562a707efe..023e4ac142 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -3,7 +3,10 @@ from subprocess import PIPE import allure +import numpy as np import torch +from decord import VideoReader, cpu +from PIL import Image from pytest_assume.plugin import assume from utils.get_run_config import get_model_name, get_tp_num from utils.rule_condition_assert import assert_result @@ -13,6 +16,7 @@ from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy.vl.utils import encode_image_base64 def run_pipeline_chat_test(config, @@ -275,6 +279,12 @@ def assert_pipeline_single_element(output, PIC1 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' # noqa E501 +PIC_BEIJING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg' # noqa E501 +PIC_CHONGQING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg' # noqa E501 +PIC_REDPANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg' # noqa E501 +PIC_PANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg' # noqa E501 +DESC = 'What are the similarities and differences between these two images.' # noqa E501 +DESC_ZH = '两张图有什么相同和不同的地方.' 
# noqa E501 def run_pipeline_vl_chat_test(config, @@ -386,12 +396,350 @@ def run_pipeline_vl_chat_test(config, ', reason: Multi-turn example: ski not in ' + sess.response.text + '\n') + if 'internvl' in model_case.lower(): + internvl_vl_testcase(config, pipe, file) + internvl_vl_testcase(config, pipe, file, 'cn') + if 'minicpm' in model_case.lower(): + MiniCPM_vl_testcase(config, pipe, file) + if 'qwen' in model_case.lower(): + Qwen_vl_testcase(config, pipe, file) + file.close() del pipe torch.cuda.empty_cache() +def internvl_vl_testcase(config, pipe, file, lang='en'): + if lang == 'cn': + description = DESC_ZH + else: + description = DESC + # multi-image multi-round conversation, combined images + messages = [ + dict(role='user', + content=[ + dict(type='text', + text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\n{description}'), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_PANDA)) + ]) + ] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images: panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=description)) + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images second: panda not in ' + + response.text + '\n') + + # multi-image multi-round conversation, separate images + messages = [ + dict( + role='user', + content=[ + dict( + type='text', + text=f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\n' + + # noqa E251,E501 + description), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_PANDA)) + ]) + ] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: separate images: panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=description)) + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: separate images second: panda not in ' + + response.text + '\n') + + # video multi-round conversation + def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_segments + frame_indices = np.array([ + int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) + for idx in range(num_segments) + ]) + return frame_indices + + def load_video(video_path, bound=None, num_segments=32): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + frame_indices = get_index(bound, + fps, + max_frame, + first_idx=0, + num_segments=num_segments) + imgs = [] + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB') + imgs.append(img) + return imgs + + resource_path = config.get('resource_path') + video_path = resource_path + 
'/red-panda.mp4' + imgs = load_video(video_path, num_segments=8) + + question = '' + for i in range(len(imgs)): + question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n' + + if lang == 'cn': + question += '小熊猫在做什么?' + else: + question += 'What is the red panda doing?' + + content = [{'type': 'text', 'text': question}] + for img in imgs: + content.append({ + 'type': 'image_url', + 'image_url': { + 'max_dynamic_patch': 1, + 'url': f'data:image/jpeg;base64,{encode_image_base64(img)}' + } + }) + + messages = [dict(role='user', content=content)] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: video images: red panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + if lang == 'cn': + messages.append(dict(role='user', content='描述视频详情,不要重复')) + else: + messages.append( + dict(role='user', + content='Describe this video in detail. Don\'t repeat.')) + response = pipe(messages) + result = 'red panda' in response.text.lower( + ) or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: video images: red panda not in ' + + response.text + '\n') + + +def llava_vl_testcase(config, pipe, file): + # multi-image multi-round conversation, combined images + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url=PIC_BEIJING)), + dict(type='image_url', image_url=dict(url=PIC_CHONGQING)) + ]) + ] + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images: buildings not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images second: buildings not in ' + + response.text + '\n') + + +def MiniCPM_vl_testcase(config, pipe, file): + # Chat with multiple images + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', + image_url=dict(max_slice_nums=9, url=PIC_REDPANDA)), + dict(type='image_url', + image_url=dict(max_slice_nums=9, url=PIC_PANDA)) + ]) + ] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: multiple images: panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: multiple images second: panda not in ' + + response.text + '\n') + + # In-context few-shot learning + EXAMPLE1 = 'https://github.com/user-attachments/assets/405d9147-95f6-4f78-8879-606a0aed6707' # noqa E251,E501 + EXAMPLE2 = 'https://github.com/user-attachments/assets/9f2c6ed9-2aa5-4189-9c4f-0b9753024ba1' # noqa E251,E501 + EXAMPLE3 = 
'https://github.com/user-attachments/assets/f335b507-1957-4c22-84ae-ed69ff79df38' # noqa E251,E501 + question = 'production date' + messages = [ + dict(role='user', + content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url=EXAMPLE1)), + ]), + dict(role='assistant', content='2021.08.29'), + dict(role='user', + content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url=EXAMPLE2)), + ]), + dict(role='assistant', content='1999.05.15'), + dict(role='user', + content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url=EXAMPLE3)), + ]) + ] + response = pipe(messages) + result = '2021' in response.text.lower() or '14' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: in context learning: 2021 or 14 not in ' + + response.text + '\n') + + # Chat with video + MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number + + def encode_video(video_path): + + def uniform_sample(length, n): + gap = len(length) / n + idxs = [int(i * gap + gap / 2) for i in range(n)] + return [length[i] for i in idxs] + + vr = VideoReader(video_path, ctx=cpu(0)) + sample_fps = round(vr.get_avg_fps() / 1) # FPS + frame_idx = [i for i in range(0, len(vr), sample_fps)] + if len(frame_idx) > MAX_NUM_FRAMES: + frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) + frames = vr.get_batch(frame_idx).asnumpy() + frames = [Image.fromarray(v.astype('uint8')) for v in frames] + print('num frames:', len(frames)) + return frames + + resource_path = config.get('resource_path') + video_path = resource_path + '/red-panda.mp4' + frames = encode_video(video_path) + question = 'Describe the video' + + content = [dict(type='text', text=question)] + for frame in frames: + content.append( + dict(type='image_url', + image_url=dict( + use_image_id=False, + max_slice_nums=2, + url=f'data:image/jpeg;base64,{encode_image_base64(frame)}' + ))) + + messages = [dict(role='user', content=content)] + response = pipe(messages) + result = 'red panda' in response.text.lower( + ) or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: video example: panda not in ' + response.text + + '\n') + + +def Qwen_vl_testcase(config, pipe, file): + # multi-image multi-round conversation, combined images + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url=PIC_BEIJING)), + dict(type='image_url', image_url=dict(url=PIC_CHONGQING)) + ]) + ] + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images: buildings not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images second: buildings not in ' + + response.text + '\n') + + # image resolution for performance boost + min_pixels = 64 * 28 * 28 + max_pixels = 64 * 28 * 28 + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', + 
image_url=dict(min_pixels=min_pixels, + max_pixels=max_pixels, + url=PIC_BEIJING)), + dict(type='image_url', + image_url=dict(min_pixels=min_pixels, + max_pixels=max_pixels, + url=PIC_CHONGQING)) + ]) + ] + response = pipe(messages) + result = 'ski' in response.text.lower() or '滑雪' in response.text.lower() + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: performance boost: buildings not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: performance boost second: buildings not in ' + + response.text + '\n') + + def assert_pipeline_vl_chat_log(config, model_case, worker_id): log_path = config.get('log_path') diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 77af1975be..082a61bcda 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -282,6 +282,7 @@ def get_model(url): PIC = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 +PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' # noqa E501 def run_vl_testcase(config, port: int = DEFAULT_PORT): @@ -307,6 +308,11 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): 'image_url': { 'url': PIC, }, + }, { + 'type': 'image_url', + 'image_url': { + 'url': PIC2, + }, }], }] @@ -315,8 +321,6 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): temperature=0.8, top_p=0.8) file.writelines(str(response).lower() + '\n') - assert 'tiger' in str(response).lower() or '虎' in str( - response).lower(), response api_client = APIClient(http_url) model_name = api_client.available_models[0] @@ -324,7 +328,12 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): messages=prompt_messages): continue file.writelines(str(item) + '\n') - assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) + + assert 'tiger' in str(response).lower() or '虎' in str( + response).lower() or 'ski' in str(response).lower() or '滑雪' in str( + response).lower(), response + assert 'tiger' in str(item).lower() or '虎' in str(item).lower( + ) or 'ski' in str(item).lower() or '滑雪' in str(item).lower(), item diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index da52241253..cd43e79c94 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -19,7 +19,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | | Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | No | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | @@ -36,7 +36,7 @@ The following tables detail the models supported by 
LMDeploy's TurboMind engine
 | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes |
 | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes |
 | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - |
-| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO |
+| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | No |
 
 "-" means not verified yet.
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 502e91b6d3..7ec36d2351 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -19,7 +19,7 @@
 | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
 | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
 | Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
-| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
+| Mistral | 7B | LLM | Yes | Yes | Yes | No |
 | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
 | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
 | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
@@ -36,7 +36,7 @@
 | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes |
 | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes |
 | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - |
-| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO |
+| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | No |
 
 “-” 表示还没有验证。

From 4ede6314aac338e3b141fe9c909233421d7b636f Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Fri, 29 Nov 2024 18:43:46 +0800
Subject: [PATCH 27/40] refactor turbomind (2/N) (#2818)

---
 CMakeLists.txt                                |   2 +-
 lmdeploy/turbomind/turbomind.py               |   8 +-
 src/turbomind/models/llama/LlamaBatch.h       |   4 +-
 src/turbomind/models/llama/LlamaV2.h          |   6 +-
 src/turbomind/python/bind.cpp                 | 214 ++++++-------
 .../triton_backend/llama/LlamaTritonModel.cc  | 165 +++++-----
 .../triton_backend/llama/LlamaTritonModel.h   |  64 ++--
 .../llama/LlamaTritonModelInstance.cc         | 206 +++++--------
 .../llama/LlamaTritonModelInstance.h          |  36 +--
 .../transformer_triton_backend.cpp            |  52 ++--
 .../transformer_triton_backend.hpp            | 283 ++----------------
 src/turbomind/utils/Tensor.h                  |  10 +
 src/turbomind/utils/instance_comm.h           |  16 -
 13 files changed, 370 insertions(+), 696 deletions(-)
 delete mode 100644 src/turbomind/utils/instance_comm.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff2ac7dded..356da56f58 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -304,7 +304,7 @@ link_directories(
 # add_subdirectory(3rdparty)
 add_subdirectory(src)
-add_subdirectory(examples)
+# add_subdirectory(examples)
 
 if(BUILD_TEST)
     add_subdirectory(tests/csrc)
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 05bc3e400e..a1b2fff944 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -358,12 +358,10 @@ def _forward_callback(self, result, ctx):
         self.que.put((False, result))
 
     def _forward_thread(self, inputs):
-        instance_comm = self.tm_model.model_comm.create_instance_comm(
-            self.gpu_count)
 
         def _func():
             try:
-                output = self.model_inst.forward(inputs, instance_comm)
+                output = self.model_inst.forward(inputs)
             except Exception as e:
                 logger.error(f'unhandled exception: {e}')
                 self.que.put((-1, None))
@@ -377,12 +375,10 @@ def _async_forward_callback(self, result, ctx, que: LifoQueue):
         que.put((False, result))
 
     def _async_forward_thread(self, inputs, que: LifoQueue):
-        instance_comm = self.tm_model.model_comm.create_instance_comm(
-            self.gpu_count)
 
         def _func():
             try:
-                output = self.model_inst.forward(inputs, instance_comm)
+                output = self.model_inst.forward(inputs)
             except Exception as e:
                 logger.error(f'unhandled exception: {e}')
                 que.put((-1, None))
diff
--git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 9c66948999..f952da6bae 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -12,7 +12,6 @@ #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/instance_comm.h" #include #include #include @@ -32,8 +31,7 @@ struct SharedState { }; struct Control { - AbstractInstanceComm* comm; - Request::Callback callback; + Request::Callback callback; }; struct BatchState { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 658282f5e5..a0d35b887f 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,6 +21,9 @@ #pragma once +#include +#include + #include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" @@ -31,10 +34,7 @@ #include "src/turbomind/models/llama/unified_decoder.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" -#include -#include namespace turbomind { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 5a344d9545..71792a4be8 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,34 +1,38 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/python/dlpack.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/nccl_utils.h" -#include #include +#include + +#include + #include #include #include #include #include -#include + +#include "src/turbomind/python/dlpack.h" +#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" +#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/nccl_utils.h" namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; // prepare to bind container -using TensorVector = std::vector; +using TensorVector = std::vector; PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(triton::Tensor& tensor) +DLDevice getDLDevice(ft::Tensor& tensor) { int device_id = 0; - if (tensor.where == triton::MEMORY_GPU) { + if (tensor.where == ft::MEMORY_GPU) { cudaPointerAttributes ptr_attr; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; @@ -37,13 +41,13 @@ DLDevice getDLDevice(triton::Tensor& tensor) DLDevice device{kDLCPU, device_id}; switch (tensor.where) { - case triton::MEMORY_CPU: + case ft::MEMORY_CPU: device.device_type = DLDeviceType::kDLCPU; break; - case triton::MEMORY_CPU_PINNED: + case ft::MEMORY_CPU_PINNED: device.device_type = DLDeviceType::kDLCUDAHost; break; - case triton::MEMORY_GPU: + case ft::MEMORY_GPU: device.device_type = DLDeviceType::kDLCUDA; break; default: @@ -53,62 +57,62 @@ DLDevice getDLDevice(triton::Tensor& tensor) return device; } -DLManagedTensor* 
TritonTensorToDLManagedTensor(triton::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) { DLDevice device = getDLDevice(tensor); DLDataType data_type{0, 0, 1}; switch (tensor.type) { - case triton::TYPE_BOOL: + case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; break; - case triton::TYPE_UINT8: + case ft::TYPE_UINT8: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 8; break; - case triton::TYPE_UINT16: + case ft::TYPE_UINT16: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 16; break; - case triton::TYPE_UINT32: + case ft::TYPE_UINT32: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 32; break; - case triton::TYPE_UINT64: + case ft::TYPE_UINT64: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 64; break; - case triton::TYPE_INT8: - case triton::TYPE_BYTES: + case ft::TYPE_INT8: + case ft::TYPE_BYTES: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 8; break; - case triton::TYPE_INT16: + case ft::TYPE_INT16: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 16; break; - case triton::TYPE_INT32: + case ft::TYPE_INT32: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 32; break; - case triton::TYPE_INT64: + case ft::TYPE_INT64: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 64; break; - case triton::TYPE_FP16: + case ft::TYPE_FP16: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 16; break; - case triton::TYPE_FP32: + case ft::TYPE_FP32: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 32; break; - case triton::TYPE_FP64: + case ft::TYPE_FP64: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 64; break; - case triton::TYPE_BF16: + case ft::TYPE_BF16: data_type.code = DLDataTypeCode::kDLBfloat; data_type.bits = 16; break; @@ -125,78 +129,78 @@ DLManagedTensor* TritonTensorToDLManagedTensor(triton::Tensor& tensor) return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; } -triton::MemoryType getMemoryType(DLDevice device) +ft::MemoryType getMemoryType(DLDevice device) { switch (device.device_type) { case DLDeviceType::kDLCUDAHost: - return triton::MemoryType::MEMORY_CPU_PINNED; + return ft::MemoryType::MEMORY_CPU_PINNED; case DLDeviceType::kDLCUDA: - return triton::MemoryType::MEMORY_GPU; + return ft::MemoryType::MEMORY_GPU; case DLDeviceType::kDLCPU: default: - return triton::MemoryType::MEMORY_CPU; + return ft::MemoryType::MEMORY_CPU; } } -triton::DataType getDataType(DLDataType data_type) +ft::DataType getDataType(DLDataType data_type) { switch (data_type.code) { case DLDataTypeCode::kDLUInt: switch (data_type.bits) { case 8: - return triton::TYPE_UINT8; + return ft::TYPE_UINT8; case 16: - return triton::TYPE_UINT16; + return ft::TYPE_UINT16; case 32: - return triton::TYPE_UINT32; + return ft::TYPE_UINT32; case 64: - return triton::TYPE_UINT64; + return ft::TYPE_UINT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLInt: switch (data_type.bits) { case 8: - return triton::TYPE_INT8; + return ft::TYPE_INT8; case 16: - return triton::TYPE_INT16; + return ft::TYPE_INT16; case 32: - return triton::TYPE_INT32; + return ft::TYPE_INT32; case 64: - return triton::TYPE_INT64; + return ft::TYPE_INT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLFloat: switch (data_type.bits) { case 16: - return triton::TYPE_FP16; + return ft::TYPE_FP16; case 32: - return triton::TYPE_FP32; + return 
ft::TYPE_FP32; case 64: - return triton::TYPE_FP64; + return ft::TYPE_FP64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBfloat: switch (data_type.bits) { case 16: - return triton::TYPE_BF16; + return ft::TYPE_BF16; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBool: - return triton::TYPE_BOOL; + return ft::TYPE_BOOL; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -205,7 +209,7 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* t std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); + return std::make_shared(where, dtype, shape, data); } DLTensor GetDLTensor(py::object obj) @@ -270,70 +274,65 @@ PYBIND11_MODULE(_turbomind, m) // custom comm py::class_>(m, "AbstractCustomComm"); - // instance comm - py::class_(m, "AbstractInstanceComm"); - // data type - py::enum_(m, "DataType") - .value("TYPE_INVALID", triton::DataType::TYPE_INVALID) - .value("TYPE_BOOL", triton::DataType::TYPE_BOOL) - .value("TYPE_UINT8", triton::DataType::TYPE_UINT8) - .value("TYPE_UINT16", triton::DataType::TYPE_UINT16) - .value("TYPE_UINT32", triton::DataType::TYPE_UINT32) - .value("TYPE_UINT64", triton::DataType::TYPE_UINT64) - .value("TYPE_INT8", triton::DataType::TYPE_INT8) - .value("TYPE_INT16", triton::DataType::TYPE_INT16) - .value("TYPE_INT32", triton::DataType::TYPE_INT32) - .value("TYPE_INT64", triton::DataType::TYPE_INT64) - .value("TYPE_FP16", triton::DataType::TYPE_FP16) - .value("TYPE_FP32", triton::DataType::TYPE_FP32) - .value("TYPE_FP64", triton::DataType::TYPE_FP64) - .value("TYPE_BYTES", triton::DataType::TYPE_BYTES) - .value("TYPE_BF16", triton::DataType::TYPE_BF16); + py::enum_(m, "DataType") + .value("TYPE_INVALID", ft::DataType::TYPE_INVALID) + .value("TYPE_BOOL", ft::DataType::TYPE_BOOL) + .value("TYPE_UINT8", ft::DataType::TYPE_UINT8) + .value("TYPE_UINT16", ft::DataType::TYPE_UINT16) + .value("TYPE_UINT32", ft::DataType::TYPE_UINT32) + .value("TYPE_UINT64", ft::DataType::TYPE_UINT64) + .value("TYPE_INT8", ft::DataType::TYPE_INT8) + .value("TYPE_INT16", ft::DataType::TYPE_INT16) + .value("TYPE_INT32", ft::DataType::TYPE_INT32) + .value("TYPE_INT64", ft::DataType::TYPE_INT64) + .value("TYPE_FP16", ft::DataType::TYPE_FP16) + .value("TYPE_FP32", ft::DataType::TYPE_FP32) + .value("TYPE_FP64", ft::DataType::TYPE_FP64) + .value("TYPE_BYTES", ft::DataType::TYPE_BYTES) + .value("TYPE_BF16", ft::DataType::TYPE_BF16); // memory type - py::enum_(m, "MemoryType") - .value("MEMORY_CPU", triton::MemoryType::MEMORY_CPU) - .value("MEMORY_CPU_PINNED", triton::MemoryType::MEMORY_CPU_PINNED) - .value("MEMORY_GPU", triton::MemoryType::MEMORY_GPU); + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", ft::MemoryType::MEMORY_CPU) + .value("MEMORY_CPU_PINNED", ft::MemoryType::MEMORY_CPU_PINNED) + .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &triton::Tensor::where) - .def_readonly("type", &triton::Tensor::type) - .def_readonly("shape", &triton::Tensor::shape) - .def_readonly("data", &triton::Tensor::data) - .def(py::init([](const triton::MemoryType where, - const triton::DataType type, 
- const std::vector& shape, - const long data) { - auto data_ptr = reinterpret_cast(data); - return new triton::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_readonly("where", &ft::Tensor::where) + .def_readonly("type", &ft::Tensor::type) + .def_readonly("shape", &ft::Tensor::shape) + .def_readonly("data", &ft::Tensor::data) + .def(py::init( + [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { + auto data_ptr = reinterpret_cast(data); + return new ft::Tensor(where, type, shape, data_ptr); + })) .def( "view", - [](triton::Tensor* self, triton::DataType new_type) { - return new triton::Tensor(self->where, new_type, self->shape, self->data); + [](ft::Tensor* self, ft::DataType new_type) { + return new ft::Tensor(self->where, new_type, self->shape, self->data); }, "new_type"_a) .def( "view", - [](triton::Tensor* self, std::vector new_shape) { - return new triton::Tensor(self->where, self->type, new_shape, self->data); + [](ft::Tensor* self, std::vector new_shape) { + return new ft::Tensor(self->where, self->type, new_shape, self->data); }, "new_shape"_a) .def( "copy_from", - [](triton::Tensor* self, py::object obj) { + [](ft::Tensor* self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); switch (self->type) { - case triton::TYPE_FP16: - case triton::TYPE_FP32: - case triton::TYPE_INT32: - case triton::TYPE_BF16: { + case ft::TYPE_FP16: + case ft::TYPE_FP32: + case ft::TYPE_INT32: + case ft::TYPE_BF16: { auto num_element = std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; @@ -348,7 +347,7 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](triton::Tensor* self, long stream) { + [](ft::Tensor* self, long stream) { DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = @@ -364,7 +363,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](triton::Tensor* self) { + .def("__dlpack_device__", [](ft::Tensor* self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -380,19 +379,19 @@ PYBIND11_MODULE(_turbomind, m) "dl_managed_tensor"_a); // transformer model instance + using ft::AbstractTransformerModelInstance; py::bind_map>(m, "TensorMap"); py::class_(m, "AbstractTransformerModelInstance") .def( "forward", - [](AbstractTransformerModelInstance* model, - std::shared_ptr input_tensors, - ft::AbstractInstanceComm* inst_comm) { return model->forward(input_tensors, inst_comm); }, + [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { + return model->forward(input_tensors); + }, py::call_guard(), - "input_tensors"_a, - "inst_comm"_a = nullptr) + "input_tensors"_a) .def( "register_callback", - [](AbstractTransformerModelInstance* self, triton_stream_cb_t cb, py::object ctx) { + [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { self->registerCallback(cb, ctx.ptr()); }, "callback"_a, @@ -400,6 +399,8 @@ PYBIND11_MODULE(_turbomind, m) .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); // transformer model + using ft::AbstractTransformerModel; + using ft::LlamaTritonModel; py::class_>(m, 
"AbstractTransformerModel") .def_static( "create_llama_model", @@ -463,7 +464,6 @@ PYBIND11_MODULE(_turbomind, m) return ret; }, "world_size"_a) - .def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a) .def( "create_model_instance", [](AbstractTransformerModel* model, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 1c7c5eb468..40c5ac8907 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -27,17 +27,18 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/allocator.h" +#include "src/turbomind/utils/cuda_utils.h" + #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cuda_utils.h" -namespace ft = turbomind; +namespace turbomind { -static std::optional get_moe_method() +static std::optional get_moe_method() { - static const auto value = []() -> std::optional { + static const auto value = []() -> std::optional { const auto p = std::getenv("TM_MOE_METHOD"); if (p) { std::string str(p); @@ -45,10 +46,10 @@ static std::optional get_moe_method() x = std::tolower(x); } if (str == "naive") { - return ft::MoeParam::kNaive; + return MoeParam::kNaive; } else if (str == "fused") { - return ft::MoeParam::kFused; + return MoeParam::kFused; } else { std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n"; @@ -67,7 +68,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; @@ -91,7 +92,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } else { @@ -103,7 +104,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } return nullptr; @@ -205,10 +206,10 @@ void LlamaTritonModel::handleMissingParams() template LlamaTritonModel::~LlamaTritonModel() { - ft::FT_CHECK(weights_.size() == engines_.size()); + FT_CHECK(weights_.size() == engines_.size()); for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); engines_[device_id].reset(); weights_[device_id].reset(); } @@ -222,7 +223,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, std::string config): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), - weights_(ft::getDeviceCount()), + weights_(getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); @@ -242,7 +243,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML 
config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto model_reader = reader["model_config"]; @@ -305,7 +306,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.policy = getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); lora_param_.r = lora_reader["lora_r"].as(0); lora_param_.scale = lora_reader["lora_scale"].as(0); lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); @@ -329,75 +330,75 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, handleMissingParams(); - shared_state_ = std::make_shared(); - shared_state_->barrier = std::make_shared(tensor_para_size); + shared_state_ = std::make_shared(); + shared_state_->barrier = std::make_shared(tensor_para_size); - const auto device_count = ft::getDeviceCount(); + const auto device_count = getDeviceCount(); engines_.resize(device_count); const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - model_param_.weight_type = ft::WeightType::kFP16; + model_param_.weight_type = WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - model_param_.weight_type = ft::WeightType::kBF16; + model_param_.weight_type = WeightType::kBF16; } else if (weight_type_str == "fp32") { - model_param_.weight_type = ft::WeightType::kFP32; + model_param_.weight_type = WeightType::kFP32; } else if (weight_type_str == "int8") { - model_param_.weight_type = ft::WeightType::kINT8; + model_param_.weight_type = WeightType::kINT8; } else if (weight_type_str == "int4") { - model_param_.weight_type = ft::WeightType::kINT4; + model_param_.weight_type = WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; - ft::FT_CHECK(0); + FT_CHECK(0); } if (auto method = get_moe_method()) { moe_param_.method = *method; } else { - moe_param_.method = ft::MoeParam::kFused; + moe_param_.method = MoeParam::kFused; } TM_LOG_INFO("%s", toString().c_str()); } template -std::unique_ptr> LlamaTritonModel::createSharedModelInstance( - int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +std::unique_ptr> +LlamaTritonModel::createSharedModelInstance(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - auto ctx = std::make_unique>(device_id); + auto ctx = std::make_unique>(device_id); - ft::NcclParam tensor_para = nccl_params.first[comms_rank]; - ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; + NcclParam tensor_para = nccl_params.first[comms_rank]; + NcclParam pipeline_para = nccl_params.second[comms_rank]; - ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_); - ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); + FT_CHECK(tensor_para.world_size_ == tensor_para_size_); + FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); - auto model = std::make_unique>(model_param_, // - attn_param_, - moe_param_, - lora_param_, - tensor_para, - *ctx, - engine_param_.max_batch_size, - 
weights_[device_id]); + auto model = std::make_unique>(model_param_, // + attn_param_, + moe_param_, + lora_param_, + tensor_para, + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); - auto engine = std::make_unique>(engine_param_, // - std::move(model), - std::move(ctx), - shared_state_, - device_id); + auto engine = std::make_unique>(engine_param_, // + std::move(model), + std::move(ctx), + shared_state_, + device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang // due to concurrent kernel launch & cudaMallocHost @@ -413,14 +414,14 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int device_id, int rank, cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) + std::pair, std::vector>, + std::shared_ptr) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(engines_[device_id] != nullptr); + FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); + auto allocator = std::make_unique>(device_id, false); allocator->setStream(stream); @@ -430,12 +431,12 @@ LlamaTritonModel::createModelInstance(int device_id, template void LlamaTritonModel::createSharedWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; - ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>( - model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); + FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); + weights_[device_id] = + std::make_shared>(model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -444,37 +445,41 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) } template -TensorMap LlamaTritonModel::getParams(int deviceId, int rank) +std::unordered_map LlamaTritonModel::getParams(int deviceId, int rank) { - ft::check_cuda_error(cudaSetDevice(deviceId)); + check_cuda_error(cudaSetDevice(deviceId)); + // shared_weight should be created before getParams - ft::FT_CHECK(weights_[deviceId] != nullptr); - ft::TensorMap output = weights_[deviceId]->getParams(); - TensorMap result; + FT_CHECK(weights_[deviceId] != nullptr); + + TensorMap output = weights_[deviceId]->getParams(); + + std::unordered_map result; for (auto [name, tensor] : output) { - result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}); + result.insert({{name, Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}}}); } + return result; } template void LlamaTritonModel::processWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(weights_[device_id] != nullptr); + check_cuda_error(cudaSetDevice(device_id)); + FT_CHECK(weights_[device_id] != nullptr); cudaDeviceProp props{}; - ft::check_cuda_error(cudaGetDeviceProperties(&props, device_id)); + check_cuda_error(cudaGetDeviceProperties(&props, device_id)); weights_[device_id]->prepare(props); - ft::sync_check_cuda_error(); + sync_check_cuda_error(); } template -void LlamaTritonModel::createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +void LlamaTritonModel::createEngine(int 
device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); @@ -515,17 +520,11 @@ std::string LlamaTritonModel::toString() } template -void LlamaTritonModel::createCustomComms( - std::vector>* custom_all_reduce_comms, int world_size) +void LlamaTritonModel::createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) { - using commDataType = typename ft::CustomARCommTypeConverter::Type; - ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); -} - -template -std::unique_ptr LlamaTritonModel::createInstanceComm(int size) -{ - return nullptr; + using commDataType = typename CustomARCommTypeConverter::Type; + initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); } template @@ -547,3 +546,5 @@ template struct LlamaTritonModel; #ifdef ENABLE_BF16 template struct LlamaTritonModel<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index a6c1b862ac..8f473cd4cd 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -31,7 +31,7 @@ #include #include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { @@ -44,27 +44,25 @@ struct LlamaTritonModel: public AbstractTransformerModel { ~LlamaTritonModel() override; std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) override; void createSharedWeights(int deviceId, int rank) override; - TensorMap getParams(int deviceId, int rank) override; + std::unordered_map getParams(int deviceId, int rank) override; void processWeights(int deviceId, int rank) override; - void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) override; + void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) override; - void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) override; - - std::unique_ptr createInstanceComm(int size) override; + void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) override; void handleMissingParams(); @@ -78,24 +76,24 @@ struct LlamaTritonModel: public AbstractTransformerModel { int getPipelineParaSize() override; private: - std::unique_ptr> - createSharedModelInstance(int deviceId, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr); - - ft::ModelParam model_param_; - ft::AttentionParam attn_param_; - ft::MoeParam moe_param_; - ft::LoraParam lora_param_; - ft::EngineParam engine_param_; - size_t tensor_para_size_; - size_t pipeline_para_size_; - - std::shared_ptr shared_state_; + std::unique_ptr> + createSharedModelInstance(int deviceId, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr); + + ModelParam model_param_; + AttentionParam attn_param_; + MoeParam moe_param_; + LoraParam lora_param_; + EngineParam engine_param_; + size_t 
tensor_para_size_; + size_t pipeline_para_size_; + + std::shared_ptr shared_state_; // Weights & engine instances for the ranks - std::vector>> weights_; - std::vector>> engines_; + std::vector>> weights_; + std::vector>> engines_; bool is_fp16_; int enable_custom_all_reduce_ = 0; @@ -105,3 +103,5 @@ struct LlamaTritonModel: public AbstractTransformerModel { ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index 8221f932ce..976fc9cc1d 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -31,78 +31,23 @@ #include #include -namespace ft = turbomind; +namespace turbomind { template -void triton_stream_callback(std::unordered_map* output_tensors, void* ctx) +void triton_stream_callback(std::unordered_map* outputs, void* ctx) { - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - auto result = LlamaTritonModelInstance::convert_outputs(*output_tensors); - - model->stream_cb_(result, model->stream_ctx_); + LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); + model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); } template -LlamaTritonModelInstance::LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id): +LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id): device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) { } -template -std::unordered_map LlamaTritonModelInstance::convert_inputs( - std::shared_ptr> input_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - const size_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const size_t input_data_len = input_tensors->at("input_ids").shape[1]; - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - std::unordered_map ft_input_tensors{}; - - for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { - if (ft_input_tensors.count(t->first) == 0) { - ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()}); - } - } - - return ft_input_tensors; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::convert_outputs(const std::unordered_map& output_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - std::unordered_map* outputs_mapping = - new std::unordered_map(); - - for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) { - outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)}); - } - - return std::shared_ptr>(outputs_mapping); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - template std::string format_vector(const std::vector& vec) { @@ -118,120 +63,109 @@ std::string format_vector(const std::vector& vec) } template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm* instance_comm) +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> inputs) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); // In some cases, this is needed to trigger the creation of CUDA context, or 
later `cudaMallocAsync` will die - ft::check_cuda_error(cudaSetDevice(device_id_)); + check_cuda_error(cudaSetDevice(device_id_)); - FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2, - "input_tensors->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1, - "input_tensors->at(\"input_lengths\").shape.size() == 1"); + FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); + FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, + "inputs->at(\"input_lengths\").shape.size() == 1"); - const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const uint32_t max_request_output_len = (size_t)*std::max_element( - (int*)input_tensors->at("request_output_len").data, - (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]); + const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; + const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, + (int*)inputs->at("request_output_len").data + + inputs->at("request_output_len").shape[0]); // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = - input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1; + const uint32_t beam_width = inputs->count("beam_width") ? (size_t)(*(uint*)inputs->at("beam_width").data) : 1; FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - std::unordered_map ft_input_tensors = convert_inputs(input_tensors); + h_total_output_lengths_ = + (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - const size_t max_input_len = input_tensors->at("input_ids").shape[1]; - const bool is_return_logits = - input_tensors->count("is_return_logits") && *(bool*)input_tensors->at("is_return_logits").data; + const size_t max_input_len = inputs->at("input_ids").shape[1]; + const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; const size_t vocab_size = instance_->model().vocab_size(); allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - std::unordered_map output_tensors = std::unordered_map{ + std::unordered_map outputs{ {"output_ids", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, + d_output_ids_}}, {"sequence_length", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width}, - d_sequence_lengths_}}}; - - if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) { - output_tensors.insert({"output_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - output_tensors.insert({"cum_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width}, - d_cum_log_probs_}}); + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; + + if (inputs->count("is_return_log_probs") && 
*((bool*)inputs->at("is_return_log_probs").data)) { + outputs.insert({"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_request_output_len}, + d_output_log_probs_}}); + outputs.insert( + {"cum_log_probs", + Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); } - if (input_tensors->count("logprobs")) { + if (inputs->count("logprobs")) { size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * ft::kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc(h_logprob_indexes_, - sizeof(uint32_t) * request_batch_size * beam_width - * max_logprob_length * ft::kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( + h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_indexes_ = (uint32_t*)std::realloc( + h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_nums_ = (uint32_t*)std::realloc( h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - output_tensors.insert( - {{"logprob_vals", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_vals_}}}); - - output_tensors.insert( - {{"logprob_indexes", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_indexes_}}}); - - output_tensors.insert({{"logprob_nums", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); + outputs.insert({{"logprob_vals", + Tensor{MEMORY_CPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_vals_}}}); + + outputs.insert({{"logprob_indexes", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_indexes_}}}); + + outputs.insert({{"logprob_nums", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length}, + h_logprob_nums_}}}); } if (is_return_logits) { - output_tensors.insert( - {"logits", - {ft::MEMORY_GPU, ft::TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}); + outputs.insert( + {{"logits", {MEMORY_GPU, TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); } try { - ft::Request::Callback callback; + Request::Callback callback; if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { + callback = [this](std::unordered_map* outputs) { triton_stream_callback(outputs, this); }; } - ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - instance_->Submit(&output_tensors, &ft_input_tensors, {instance_comm, callback}); + check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); + + instance_->Submit(&outputs, inputs.get(), {callback}); // ! stream synced by the model before returning } catch (...) 
{ h_exception_ = std::current_exception(); - output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}}); + outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); } - return convert_outputs(output_tensors); + return std::make_shared>(std::move(outputs)); } template @@ -278,3 +212,5 @@ template struct LlamaTritonModelInstance; #ifdef ENABLE_BF16 template struct LlamaTritonModelInstance<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 08088c05d5..2cf69b9fa5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -20,41 +20,29 @@ #pragma once +#include + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance(); - - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; + LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id); + ~LlamaTritonModelInstance() override; - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - - std::shared_ptr> - forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm*) override; - - static std::shared_ptr> - convert_outputs(const std::unordered_map& output_tensors); + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; private: - ft::Engine* instance_; - const std::unique_ptr> allocator_; - - std::unordered_map - convert_inputs(std::shared_ptr> input_tensors); + Engine* instance_; + const std::unique_ptr> allocator_; void allocateBuffer(const size_t request_batch_size, const size_t max_input_len, @@ -88,3 +76,5 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { uint32_t* h_total_output_lengths_ = nullptr; std::exception_ptr h_exception_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.cpp b/src/turbomind/triton_backend/transformer_triton_backend.cpp index 16c64b17d5..acf5e06e88 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.cpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.cpp @@ -21,62 +21,66 @@ #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/nccl_utils.h" -std::pair, std::vector> +namespace turbomind { + +std::pair, std::vector> AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node) { - const int gpu_count = ft::getDeviceCount(); + const int gpu_count = getDeviceCount(); const int tensor_para_size = getTensorParaSize(); const int pipeline_para_size = getPipelineParaSize(); const int local_comm_size = multi_node ? 
gpu_count : tensor_para_size * pipeline_para_size; - ft::FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); - ft::FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); + FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); + FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); - std::vector nccl_ids; + std::vector nccl_ids; if (tensor_para_size > 1 || pipeline_para_size > 1) { nccl_ids.resize(tensor_para_size + pipeline_para_size); if (node_id == 0) { for (uint32_t i = 0; i < nccl_ids.size(); i++) { - ft::ftNcclGetUniqueId(nccl_ids[i]); + ftNcclGetUniqueId(nccl_ids[i]); } } } - std::vector tensor_para_params(local_comm_size); - std::vector pipeline_para_params(local_comm_size); + std::vector tensor_para_params(local_comm_size); + std::vector pipeline_para_params(local_comm_size); // Don't init comm when size == 1 if (tensor_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank( + NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank( tensor_para_params[gid - device_id_start], tensor_para_rank, tensor_para_size, tensor_para_nccl_uid); tensor_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } if (pipeline_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], - pipeline_para_rank, - pipeline_para_size, - pipeline_para_nccl_uid); + NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], + pipeline_para_rank, + pipeline_para_size, + pipeline_para_nccl_uid); pipeline_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } - return std::pair, std::vector>(tensor_para_params, pipeline_para_params); + return std::pair, std::vector>(tensor_para_params, pipeline_para_params); } + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 066d75a780..6d49df4578 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -30,242 +30,11 @@ #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" -namespace ft = turbomind; +namespace turbomind { -namespace triton { 
-#ifdef USE_TRITONSERVER_DATATYPE - -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#ifndef TRITONSERVER_API_VERSION_MAJOR -#error TRITONSERVER_API_VERSION_MAJOR Undefined! -#endif - -#ifndef TRITONSERVER_API_VERSION_MINOR -#error TRITONSERVER_API_VERSION_MINOR Undefined! -#endif - -#if (TRITONSERVER_API_VERSION_MAJOR == 1 && TRITONSERVER_API_VERSION_MINOR >= 17) \ - || (TRITONSERVER_API_VERSION_MAJOR > 1) -#define ENABLE_TRITON_BF16 1 -#endif - -typedef TRITONSERVER_DataType DataType; -typedef TRITONSERVER_MemoryType MemoryType; - -constexpr TRITONSERVER_DataType TYPE_INVALID = TRITONSERVER_TYPE_INVALID; -constexpr TRITONSERVER_DataType TYPE_BOOL = TRITONSERVER_TYPE_BOOL; -constexpr TRITONSERVER_DataType TYPE_UINT8 = TRITONSERVER_TYPE_UINT8; -constexpr TRITONSERVER_DataType TYPE_UINT16 = TRITONSERVER_TYPE_UINT16; -constexpr TRITONSERVER_DataType TYPE_UINT32 = TRITONSERVER_TYPE_UINT32; -constexpr TRITONSERVER_DataType TYPE_UINT64 = TRITONSERVER_TYPE_UINT64; -constexpr TRITONSERVER_DataType TYPE_INT8 = TRITONSERVER_TYPE_INT8; -constexpr TRITONSERVER_DataType TYPE_INT16 = TRITONSERVER_TYPE_INT16; -constexpr TRITONSERVER_DataType TYPE_INT32 = TRITONSERVER_TYPE_INT32; -constexpr TRITONSERVER_DataType TYPE_INT64 = TRITONSERVER_TYPE_INT64; -constexpr TRITONSERVER_DataType TYPE_FP16 = TRITONSERVER_TYPE_FP16; -constexpr TRITONSERVER_DataType TYPE_FP32 = TRITONSERVER_TYPE_FP32; -constexpr TRITONSERVER_DataType TYPE_FP64 = TRITONSERVER_TYPE_FP64; -constexpr TRITONSERVER_DataType TYPE_BYTES = TRITONSERVER_TYPE_BYTES; - -#ifdef ENABLE_TRITON_BF16 -constexpr TRITONSERVER_DataType TYPE_BF16 = TRITONSERVER_TYPE_BF16; -#endif -constexpr TRITONSERVER_MemoryType MEMORY_CPU = TRITONSERVER_MEMORY_CPU; -constexpr TRITONSERVER_MemoryType MEMORY_CPU_PINNED = TRITONSERVER_MEMORY_CPU_PINNED; -constexpr TRITONSERVER_MemoryType MEMORY_GPU = TRITONSERVER_MEMORY_GPU; - -#else - -typedef ft::DataType DataType; -typedef ft::MemoryType MemoryType; - -constexpr DataType TYPE_INVALID = ft::TYPE_INVALID; -constexpr DataType TYPE_BOOL = ft::TYPE_BOOL; -constexpr DataType TYPE_UINT8 = ft::TYPE_UINT8; -constexpr DataType TYPE_UINT16 = ft::TYPE_UINT16; -constexpr DataType TYPE_UINT32 = ft::TYPE_UINT32; -constexpr DataType TYPE_UINT64 = ft::TYPE_UINT64; -constexpr DataType TYPE_INT8 = ft::TYPE_INT8; -constexpr DataType TYPE_INT16 = ft::TYPE_INT16; -constexpr DataType TYPE_INT32 = ft::TYPE_INT32; -constexpr DataType TYPE_INT64 = ft::TYPE_INT64; -constexpr DataType TYPE_FP16 = ft::TYPE_FP16; -constexpr DataType TYPE_FP32 = ft::TYPE_FP32; -constexpr DataType TYPE_FP64 = ft::TYPE_FP64; -constexpr DataType TYPE_BYTES = ft::TYPE_BYTES; -constexpr DataType TYPE_BF16 = ft::TYPE_BF16; -constexpr MemoryType MEMORY_CPU = ft::MEMORY_CPU; -constexpr MemoryType MEMORY_CPU_PINNED = ft::MEMORY_CPU_PINNED; -constexpr MemoryType MEMORY_GPU = ft::MEMORY_GPU; - -#endif - -struct Tensor { - const MemoryType where; - const DataType type; - const std::vector shape; - const void* data; - - Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): - where(_where), type(_type), shape(_shape), data(_data) - { - } - - static ft::DataType convertTritonTypeToFt(DataType tmp_type) - { - ft::DataType ft_data_type; - switch (tmp_type) { - case TYPE_INVALID: - ft_data_type = ft::DataType::TYPE_INVALID; - break; - case TYPE_BOOL: - ft_data_type = ft::DataType::TYPE_BOOL; - break; - case TYPE_UINT8: - ft_data_type = ft::DataType::TYPE_UINT8; - break; - case TYPE_UINT16: - ft_data_type = 
ft::DataType::TYPE_UINT16; - break; - case TYPE_UINT32: - ft_data_type = ft::DataType::TYPE_UINT32; - break; - case TYPE_UINT64: - ft_data_type = ft::DataType::TYPE_UINT64; - break; - case TYPE_INT8: - ft_data_type = ft::DataType::TYPE_INT8; - break; - case TYPE_INT16: - ft_data_type = ft::DataType::TYPE_INT16; - break; - case TYPE_INT32: - ft_data_type = ft::DataType::TYPE_INT32; - break; - case TYPE_INT64: - ft_data_type = ft::DataType::TYPE_INT64; - break; - case TYPE_FP16: - ft_data_type = ft::DataType::TYPE_FP16; - break; - case TYPE_FP32: - ft_data_type = ft::DataType::TYPE_FP32; - break; - case TYPE_FP64: - ft_data_type = ft::DataType::TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - ft_data_type = ft::DataType::TYPE_BF16; - break; -#endif - case TYPE_BYTES: - ft_data_type = ft::DataType::TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(tmp_type)); - break; - } - return ft_data_type; - } - - ft::Tensor convertTritonTensorToFt() - { - ft::DataType ft_data_type = convertTritonTypeToFt(type); - ft::MemoryType ft_memory_type; - switch (where) { - case MEMORY_CPU: - ft_memory_type = ft::MemoryType::MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - ft_memory_type = ft::MemoryType::MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - ft_memory_type = ft::MemoryType::MEMORY_GPU; - break; - } - return ft::Tensor{ft_memory_type, ft_data_type, shape, data}; - } - - static Tensor convertFtTensorToTriton(ft::Tensor ft_tensor) - { - DataType triton_data_type; - switch (ft_tensor.type) { - case TYPE_INVALID: - triton_data_type = TYPE_INVALID; - break; - case TYPE_BOOL: - triton_data_type = TYPE_BOOL; - break; - case TYPE_UINT8: - triton_data_type = TYPE_UINT8; - break; - case TYPE_UINT16: - triton_data_type = TYPE_UINT16; - break; - case TYPE_UINT32: - triton_data_type = TYPE_UINT32; - break; - case TYPE_UINT64: - triton_data_type = TYPE_UINT64; - break; - case TYPE_INT8: - triton_data_type = TYPE_INT8; - break; - case TYPE_INT16: - triton_data_type = TYPE_INT16; - break; - case TYPE_INT32: - triton_data_type = TYPE_INT32; - break; - case TYPE_INT64: - triton_data_type = TYPE_INT64; - break; - case TYPE_FP16: - triton_data_type = TYPE_FP16; - break; - case TYPE_FP32: - triton_data_type = TYPE_FP32; - break; - case TYPE_FP64: - triton_data_type = TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - triton_data_type = TYPE_BF16; - break; -#endif - case TYPE_BYTES: - triton_data_type = TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(ft_tensor.type)); - break; - } - MemoryType triton_memory_type; - switch (ft_tensor.where) { - case MEMORY_CPU: - triton_memory_type = MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - triton_memory_type = MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - triton_memory_type = MEMORY_GPU; - break; - } - return Tensor{triton_memory_type, triton_data_type, ft_tensor.shape, ft_tensor.data}; - } -}; - -} // namespace triton - -using triton_stream_cb_t = std::function>, void*)>; +using triton_stream_cb_t = std::function>, void*)>; struct AbstractTransformerModel; struct AbstractTransformerModelInstance; @@ -273,17 +42,8 @@ struct AbstractTransformerModelInstance; struct AbstractTransformerModelInstance { virtual ~AbstractTransformerModelInstance() = default; - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual 
std::shared_ptr> - forward(std::shared_ptr> input_tensors, ft::AbstractInstanceComm*) - { - return forward(input_tensors); - } + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) = 0; void registerCallback(triton_stream_cb_t cb, void* ctx) { @@ -301,43 +61,38 @@ struct AbstractTransformerModelInstance { void* stream_ctx_ = nullptr; }; -using TensorMap = std::unordered_map; - struct AbstractTransformerModel { static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; - virtual std::pair, std::vector> + virtual std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); - virtual void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) = 0; - - virtual std::unique_ptr createInstanceComm(int size) - { - return nullptr; - } + virtual void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) = 0; virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; - virtual TensorMap getParams(int deviceId, int rank) = 0; + virtual std::unordered_map getParams(int deviceId, int rank) = 0; virtual void processWeights(int deviceId, int rank) = 0; - virtual void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) = 0; + virtual void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) = 0; virtual std::string toString() = 0; virtual int getTensorParaSize() = 0; virtual int getPipelineParaSize() = 0; }; + +} // namespace turbomind diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index 6214f6bbc2..b2b8524e09 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -515,6 +515,16 @@ class TensorMap { return tensor_map_.end(); } + int count(const std::string& key) const + { + return tensor_map_.count(key); + } + + bool empty() const + { + return tensor_map_.empty(); + } + std::string toString(); static TensorMap fromNpyFolder(const std::string& base_folder); void saveNpy(const std::string& base_folder); diff --git a/src/turbomind/utils/instance_comm.h b/src/turbomind/utils/instance_comm.h deleted file mode 100644 index 5a25360a05..0000000000 --- a/src/turbomind/utils/instance_comm.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -namespace turbomind { - -class AbstractInstanceComm { -public: - virtual ~AbstractInstanceComm() = default; - - virtual void barrier() = 0; - - virtual void setSharedObject(void*) = 0; - - virtual void* getSharedObject() = 0; -}; - -} // namespace turbomind From ad21c4d73ac856ddc1fc96b9b54231ae266199bd Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 2 Dec 2024 13:58:27 +0800 Subject: [PATCH 28/40] add openssh-server installation in dockerfile (#2830) * add openssh-server installation in dockerfile * add sudo --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 664dc7271f..caa58ee637 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -13,7 +13,7 @@ ARG PYTHON_VERSION=3.10 ARG TORCH_VERSION=2.3.0 ARG TORCHVISION_VERSION=0.18.0 -RUN 
apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\ +RUN apt-get update -y && apt-get install -y software-properties-common wget vim git curl openssh-server ssh sudo &&\ curl https://sh.rustup.rs -sSf | sh -s -- -y &&\ add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ ninja-build rapidjson-dev libgoogle-glog-dev gdb python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ From 776677a43961cc985eb03c0197c0adf620b9ebc5 Mon Sep 17 00:00:00 2001 From: zhabuye <74179177+zhabuye@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:01:05 +0800 Subject: [PATCH 29/40] Add version restrictions in runtime_ascend.txt to ensure functionality (#2836) --- requirements/runtime_ascend.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index d87748e396..05d74bbe72 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -1,5 +1,5 @@ accelerate>=0.29.3 -dlinfer-ascend +dlinfer-ascend>=0.1.2 einops fastapi fire From b91ce9a259d3af4bba14c05b968fdf24373545d6 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:26:29 +0800 Subject: [PATCH 30/40] Fix gemma2 accuracy through the correct softcapping logic (#2842) * Fix gemma2 accuracy through the correct softcapping logic * remove debugging codes --- lmdeploy/pytorch/kernels/cuda/flashattention.py | 17 ++++++++++------- lmdeploy/pytorch/kernels/cuda/pagedattention.py | 6 ++++-- lmdeploy/pytorch/models/gemma.py | 9 ++++++++- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/lmdeploy/pytorch/kernels/cuda/flashattention.py b/lmdeploy/pytorch/kernels/cuda/flashattention.py index 7521a3e2bb..34a11ae030 100644 --- a/lmdeploy/pytorch/kernels/cuda/flashattention.py +++ b/lmdeploy/pytorch/kernels/cuda/flashattention.py @@ -49,7 +49,7 @@ def softcapping(qk, logit_softcapping: tl.constexpr): @triton.jit def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, - loop_start, loop_end, qk_scale, history_mask, + loop_start, loop_end, sm_scale, history_mask, kv_min_loc, causal_mask: tl.constexpr, window_size: tl.constexpr, logit_softcapping: tl.constexpr, BLOCK_N: tl.constexpr, @@ -71,8 +71,9 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, qk += tl.dot(q1, k1) if causal_mask: - qk *= qk_scale + qk *= sm_scale qk = softcapping(qk, logit_softcapping) + qk = qk * tl_log2(math.e) qk_mask = (history_mask[:, None]) >= (start_n + offs_n[None, :]) if window_size > 0: qk_mask = qk_mask and ( @@ -85,8 +86,9 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] elif window_size > 0: - qk *= qk_scale + qk *= sm_scale qk = softcapping(qk, logit_softcapping) + qk = qk * tl_log2(math.e) qk_mask = ((start_n + offs_n[None, :]) >= kv_min_loc[:, None]) qk = tl.where( qk_mask, @@ -96,11 +98,13 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] elif logit_softcapping > 0: - qk *= qk_scale + qk *= sm_scale qk = softcapping(qk, logit_softcapping) + qk = qk * tl_log2(math.e) m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] else: + qk_scale = sm_scale * tl_log2(math.e) m_i_new = tl.maximum(m_i, tl.max(qk, 1) * qk_scale) qk = qk * qk_scale - m_i_new[:, None] @@ -256,7 +260,6 @@ def 
_flash_prefill_fwd_kernel( l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) - qk_scale = sm_scale * tl_log2(math.e) history_mask = history_len + start_m * BLOCK_M + tl.arange(0, BLOCK_M) loop_end = (history_len + start_m * BLOCK_M) // BLOCK_N * BLOCK_N @@ -270,7 +273,7 @@ def _flash_prefill_fwd_kernel( k1_ptrs, loop_start, loop_end, - qk_scale, + sm_scale, history_mask, kv_min_loc, causal_mask=False, @@ -291,7 +294,7 @@ def _flash_prefill_fwd_kernel( k1_ptrs, loop_start, loop_end, - qk_scale, + sm_scale, history_mask, kv_min_loc, causal_mask=True, diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index bbd6d3cf78..fe44ca4344 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -205,11 +205,12 @@ def _fwd_grouped_split_kernel( qk += tl.dot(q, k) if BLOCK_DMODEL1 != 0: qk += tl.dot(q1, k1) - qk *= sm_scale * tl_log2(math.e) + qk *= sm_scale if logit_softcapping > 0.0: qk = qk / logit_softcapping qk = tanh(qk) qk = qk * logit_softcapping + qk = qk * tl_log2(math.e) # NOTE: inf - inf = nan, and nan will leads to error if start_n + BLOCK_N > history_len or window_size > 0: qk_mask = history_len >= (start_n + offs_n) @@ -491,11 +492,12 @@ def _fwd_grouped_split_quant_kernel( qk += tl.dot(q, k) if BLOCK_DMODEL1 != 0: qk += tl.dot(q1, k1) - qk *= sm_scale * tl_log2(math.e) + qk *= sm_scale if logit_softcapping > 0.0: qk = qk / logit_softcapping qk = tanh(qk) qk = qk * logit_softcapping + qk = qk * tl_log2(math.e) # NOTE: inf - inf = nan, and nan will leads to error if start_n + BLOCK_N > history_len or window_size > 0: qk_mask = history_len >= (start_n + offs_n) diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py index 450767bda3..ca36f15651 100644 --- a/lmdeploy/pytorch/models/gemma.py +++ b/lmdeploy/pytorch/models/gemma.py @@ -383,6 +383,8 @@ def __init__(self, bias=False, dtype=dtype, device=device) + self.final_logit_softcapping = getattr(config, + 'final_logit_softcapping', None) def forward( self, @@ -405,7 +407,12 @@ def forward( def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" - return self.lm_head(hidden_states) + logits = self.lm_head(hidden_states) + if self.final_logit_softcapping is not None: + logits = logits / self.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.final_logit_softcapping + return logits def get_input_embeddings(self): """get input embeddings.""" From c158d1877bc31aeb49f8e1b16536a882246bc130 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 2 Dec 2024 16:09:00 +0800 Subject: [PATCH 31/40] fix accessing before initialization (#2845) * fix accessing before initialization * fix linting --- .../models/llama/LlamaDecoderLayerWeight.cc | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 0a2a3be175..393a6a0e87 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -68,6 +68,28 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, tensor_para_size_(tp_size), tensor_para_rank_(tp_rank) { + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + tensor_para_size_, + weight_type_, + 
model.group_size}; + + ffn_weights = LlamaFfnWeight{ + hidden_units_, + inter_size_, + tensor_para_size_, + weight_type_, + model.group_size, + weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), + }; + + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; + if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; @@ -106,28 +128,6 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, } fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - - self_attn_weights = LlamaAttentionWeight{hidden_units_, - size_per_head_, - head_num_, - kv_head_num_, - model.mla, - attn_bias_, - tensor_para_size_, - weight_type_, - model.group_size}; - - ffn_weights = LlamaFfnWeight{ - hidden_units_, - inter_size_, - tensor_para_size_, - weight_type_, - model.group_size, - weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), - }; - - moe_weights = MoeFfnWeight{ - layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; } template From 986ad17c173d2052cb9b6eb7a8e866cf917e6991 Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 2 Dec 2024 16:18:43 +0800 Subject: [PATCH 32/40] better kv allocate (#2814) * better allocate * update max session len --- lmdeploy/pytorch/engine/cache_engine.py | 135 +++++++++------------ lmdeploy/pytorch/engine/engine.py | 26 +++- lmdeploy/pytorch/engine/engine_instance.py | 13 +- lmdeploy/pytorch/engine/model_agent.py | 4 +- 4 files changed, 80 insertions(+), 98 deletions(-) diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index 8eaa563947..e393adeed3 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -54,7 +54,7 @@ def __init__( self.cache_stream = torch.cuda.Stream() assert self.cache_stream != torch.cuda.current_stream() # Initialize the events for stream synchronization. 
- self.events = [torch.cuda.Event() for _ in range(self.num_layers)] + self.events = torch.cuda.Event() logger.debug( f'Initialize cache engine with {cache_config.num_gpu_blocks}' @@ -156,80 +156,60 @@ def get_value_block_shape(self, local=local, ) - def allocate_gpu_cache(self): - """allocate caches on GPU.""" - gpu_cache: List[KVCache] = [] + def _allocate_cache(self, num_blocks: int, device: torch.device): + """allocate cache implement.""" key_block_shape = self.get_key_block_shape(local=True) value_block_shape = self.get_value_block_shape(local=True) - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_gpu_blocks, *key_block_shape), - dtype=self.kv_cache_dtype, - device='cuda', + num_layers = self.num_layers + kv_cache_dtype = self.kv_cache_dtype + + key_cache = torch.empty( + size=(num_layers, num_blocks, *key_block_shape), + dtype=kv_cache_dtype, + device=device, + ) + value_cache = torch.empty( + size=(num_layers, num_blocks, *value_block_shape), + dtype=kv_cache_dtype, + device=device, + ) + + output = (key_cache, value_cache) + + if self.cache_config.quant_policy in (4, 8): + dtype = self.model_config.dtype + key_sz_cache = torch.empty( + size=(num_layers, num_blocks, *key_block_shape[:-1], 2), + dtype=dtype, + device=device, ) - value_blocks = torch.empty( - size=(self.num_gpu_blocks, *value_block_shape), - dtype=self.kv_cache_dtype, - device='cuda', + val_sz_cache = torch.empty( + size=(num_layers, num_blocks, *value_block_shape[:-1], 2), + dtype=dtype, + device=device, ) - if self.cache_config.quant_policy in (4, 8): - key_scales_zeros = torch.empty( - size=(self.num_gpu_blocks, *key_block_shape[:-1], 2), - dtype=self.model_config.dtype, - device='cuda', - ) - value_scales_zeros = torch.empty( - size=(self.num_gpu_blocks, *value_block_shape[:-1], 2), - dtype=self.model_config.dtype, - device='cuda', - ) - gpu_cache.append((key_blocks, value_blocks, key_scales_zeros, - value_scales_zeros)) - else: - gpu_cache.append((key_blocks, value_blocks)) - - return gpu_cache + output = output + (key_sz_cache, val_sz_cache) + + return output + + def allocate_gpu_cache(self): + """allocate caches on GPU.""" + caches = self._allocate_cache(self.num_gpu_blocks, 'cuda') + self.full_gpu_cache = caches + self.local_gpu_cache = list(zip(*caches)) + return self.local_gpu_cache def allocate_cpu_cache(self): """allocate caches on Host.""" - cpu_cache: List[KVCache] = [] - key_block_shape = self.get_key_block_shape(local=True) - value_block_shape = self.get_value_block_shape(local=True) - - # TODO: pin memory might need be banned on wsl - pin_memory = True + caches = self._allocate_cache(self.num_gpu_blocks, 'cpu') - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_cpu_blocks, *key_block_shape), - dtype=self.kv_cache_dtype, - pin_memory=pin_memory, - ) - value_blocks = torch.empty( - size=(self.num_cpu_blocks, *value_block_shape), - dtype=self.kv_cache_dtype, - pin_memory=pin_memory, - ) - if self.cache_config.quant_policy in (4, 8): - key_scales_zeros = torch.empty( - size=(self.num_cpu_blocks, *key_block_shape[:-1], 2), - dtype=self.model_config.dtype, - pin_memory=pin_memory, - ) - value_scales_zeros = torch.empty( - size=(self.num_cpu_blocks, *value_block_shape[:-1], 2), - dtype=self.model_config.dtype, - pin_memory=pin_memory, - ) - cpu_cache.append((key_blocks, value_blocks, key_scales_zeros, - value_scales_zeros)) - else: - cpu_cache.append((key_blocks, value_blocks)) - return cpu_cache + self.full_cpu_cache = caches + self.local_cpu_cache 
= list(zip(*caches)) + return self.local_cpu_cache @torch.inference_mode() - def _swap(self, src: List[KVCache], dst: List[KVCache], + def _swap(self, src: List[torch.Tensor], dst: List[torch.Tensor], src_to_dst: Dict[int, int]): """Move caches from src memory to dst memory. @@ -238,18 +218,19 @@ def _swap(self, src: List[KVCache], dst: List[KVCache], dst (List[KVCache]): Destination cache. src_to_dst (Dict[int, int]): Map between src and dst. """ + BLOCKS_PER_COPY = 2 + num_copy = len(src_to_dst) + src_idx, dst_idx = list(zip(*src_to_dst.items())) + src_idx = torch.tensor(src_idx, device=src[0].device) + dst_idx = torch.tensor(dst_idx, device=dst[0].device) with torch.cuda.stream(self.cache_stream): - for i in range(self.num_layers): - src_key_cache, src_value_cache = src[i] - dst_key_cache, dst_value_cache = dst[i] - - for src_id, dst_id in src_to_dst.items(): - if isinstance(dst_key_cache[dst_id], torch.Tensor): - dst_key_cache[dst_id].copy_(src_key_cache[src_id]) - dst_value_cache[dst_id].copy_(src_value_cache[src_id]) - - event = self.events[i] - event.record(stream=self.cache_stream) + for scache, dcache in zip(src, dst): + for idx in range(0, num_copy, BLOCKS_PER_COPY): + sidx = src_idx[idx:idx + BLOCKS_PER_COPY] + didx = dst_idx[idx:idx + BLOCKS_PER_COPY] + sdata = scache[:, sidx] + dcache.index_copy_(1, didx, sdata.to(dcache.device)) + self.events.record(stream=self.cache_stream) def swap_in(self, src_to_dst: Dict[int, int]) -> None: """Move cache from Host to Device. @@ -257,7 +238,7 @@ def swap_in(self, src_to_dst: Dict[int, int]) -> None: Args: src_to_dst (Dict[int, int]): Map between src and dst. """ - self._swap(self.local_cpu_cache, self.local_gpu_cache, src_to_dst) + self._swap(self.full_cpu_cache, self.full_gpu_cache, src_to_dst) def swap_out(self, src_to_dst: Dict[int, int]) -> None: """Move cache from Device to Host. @@ -265,7 +246,7 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: Args: src_to_dst (Dict[int, int]): Map between src and dst. 
""" - self._swap(self.local_gpu_cache, self.local_cpu_cache, src_to_dst) + self._swap(self.full_gpu_cache, self.full_cpu_cache, src_to_dst) @classmethod def get_cache_block_size(cls, diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index cffe13bbdb..26b507e9d4 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -164,6 +164,7 @@ def __init__(self, self.cache_config = cache_config self.backend_config = backend_config self.stream = self.model_agent.stream + self.max_session_len = self._get_max_session_len() self.req_manager = self._bind_request_manager() @@ -261,6 +262,20 @@ def _response(self, data=data, err_msg=err_msg)) + def _get_max_session_len(self): + """get max session len.""" + session_len = self.scheduler_config.max_session_len + max_tokens = (self.cache_config.num_gpu_blocks * + self.cache_config.block_size) + window_size = self.cache_config.window_size + if window_size > 0 and window_size <= max_tokens: + max_tokens = (1 << 63) - 1 + if session_len is None: + session_len = max_tokens + else: + session_len = min(max_tokens, session_len) + return session_len + def _on_add_session(self, reqs: Request, **kwargs): """on add session callback.""" for req in reqs: @@ -315,12 +330,11 @@ def __update_bad_words(msg): def __update_max_new_tokens(msg): """update max new tokens.""" - max_session_len = self.scheduler_config.max_session_len - if max_session_len is not None: - sampling_param = msg.sampling_param - sampling_param.max_new_tokens = min( - sampling_param.max_new_tokens, - max_session_len - msg.num_all_tokens()) + max_session_len = self.max_session_len + sampling_param = msg.sampling_param + sampling_param.max_new_tokens = min( + sampling_param.max_new_tokens, + max_session_len - msg.num_all_tokens()) for req in reqs: session_id = req.data['session_id'] diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 3e741c7ba2..455ab1ccb3 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -89,21 +89,10 @@ class EngineInstance: """ def __init__(self, engine: Engine): - - def __get_max_input_len(engine): - """get max input len.""" - cache_config = engine.cache_config - max_input_len = (cache_config.block_size * - cache_config.num_gpu_blocks) - window_size = cache_config.window_size - if window_size > 0 and window_size <= max_input_len: - max_input_len = (1 << 63) - 1 - return max_input_len - self.engine = engine self.req_sender = engine.req_manager.build_sender() - self.max_input_len = __get_max_input_len(self.engine) + self.max_input_len = self.engine.max_session_len def __del__(self): """Destructor.""" diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 74938de812..2877f59375 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -120,9 +120,7 @@ def cache_swapping(cache_engine: CacheEngine, swap_in_map: dict, issued_cache_op = True if issued_cache_op: - cache_events = cache_engine.events - for event in cache_events: - event.wait() + cache_engine.events.wait() @torch.inference_mode() From 6734c71ffc0e94323854eb6ed139dbe621e71a9d Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:22:01 +0800 Subject: [PATCH 33/40] Update internvl chat template (#2832) * Add internvl2-5 chat template * fix template, using original internlm2 template --- lmdeploy/model.py | 23 
+++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 47aaaa4e88..a4355ea131 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -597,9 +597,32 @@ def match(cls, model_path: str) -> Optional[str]: path = model_path.lower() if ('internvl2' in path and 'internvl2-4b' not in path) or 'mono-internvl' in path: + if 'internvl2.5' in path or 'internvl2_5' in path: + return None return 'internvl2-internlm2' +@MODELS.register_module(name='internvl2_5') +class InternVL2_5(InternLM2Chat7B): + + def __init__( + self, + meta_instruction='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', # noqa + **kwargs): + super().__init__(meta_instruction=meta_instruction, **kwargs) + + @classmethod + def match(cls, model_path: str) -> Optional[str]: + """Return the model_name that was registered to MODELS. + + Args: + model_path (str): the model path used for matching. + """ + path = model_path.lower() + if 'internvl2.5' in path or 'internvl2_5' in path: + return 'internvl2_5' + + @MODELS.register_module(name=['internlm-xcomposer2', 'internlm-xcomposer2d5']) class InternLMXComposer2Chat7B(InternLMChat7B): """Chat template and generation parameters of InternLM-XComposer2-7b.""" From 8fbfed685f328c7fff6ec46a17dfcd0a50d2a685 Mon Sep 17 00:00:00 2001 From: q yao Date: Tue, 3 Dec 2024 11:14:35 +0800 Subject: [PATCH 34/40] profile throughput without new threads (#2826) * profile throughput without threads * optimize main loop * fix torch.event * fix python>3.11 * optimize tp * reduce cudagraph copy * optimize fill kv cache * optimize silu and mul * optimize apply rotary * remove executor * remove kernel * remove num_heads==1 --- benchmark/profile_throughput.py | 38 ++-- lmdeploy/pytorch/backends/cuda/attention.py | 5 +- lmdeploy/pytorch/engine/engine.py | 22 ++- lmdeploy/pytorch/engine/logits_process.py | 30 ++- lmdeploy/pytorch/engine/model_agent.py | 65 +------ lmdeploy/pytorch/kernels/cuda/activation.py | 105 +++------- .../kernels/cuda/apply_rotary_pos_emb.py | 43 +---- .../pytorch/kernels/cuda/fill_kv_cache.py | 182 ++++++------------ lmdeploy/pytorch/models/utils/cudagraph.py | 28 ++- tests/pytorch/engine/test_logits_process.py | 3 +- 10 files changed, 177 insertions(+), 344 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 58786d9c80..4f06fad4f9 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -1,12 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
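# Illustrative sketch (not taken from the patches): the thread-to-asyncio
# pattern the benchmarking diff below applies -- a fixed number of cooperative
# workers drain a plain Queue via get_nowait() and are awaited together with
# asyncio.gather, so no extra threads are spawned. All names here
# (worker, run, requests) are hypothetical.
import asyncio
from queue import Empty, Queue


async def worker(req_queue: Queue, results: list, worker_id: int):
    """Consume requests until the shared queue is empty."""
    while True:
        try:
            item = req_queue.get_nowait()
        except Empty:
            return
        await asyncio.sleep(0)  # yield, as a real async inference call would
        results.append((worker_id, item))


def run(requests, concurrency: int = 4):
    req_queue = Queue()
    for r in requests:
        req_queue.put(r)
    results: list = []

    async def _gather(tasks):
        return await asyncio.gather(*tasks)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = [worker(req_queue, results, i) for i in range(concurrency)]
    loop.run_until_complete(_gather(tasks))
    return results
# run(range(10)) -> 10 (worker_id, request) pairs, produced without threads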
import argparse +import asyncio import csv import json import os import random import time from queue import Queue -from threading import Thread from typing import List, Tuple, Union import numpy as np @@ -86,15 +86,15 @@ def __init__(self, model_path: str, self.csv = csv self.pbar = None - def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, - temperature: float, top_p: float, top_k: int, - stream_output: bool): + async def _inference(self, req_queue: Queue, res_queue: Queue, + session_id: int, temperature: float, top_p: float, + top_k: int, stream_output: bool): model_inst = self.tm_model.create_instance() stats = [] # get each generated token's latency per_token_latency_stats = [] for prompt, input_seqlen, output_seqlen in iter( - req_queue.get, [None, None, None]): + req_queue.get_nowait, [None, None, None]): _per_token_latency_stats = [0] * (output_seqlen + 1) prev = time.perf_counter() n_prev_token = 0 @@ -102,7 +102,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, input_ids = self.tokenizer(prompt).input_ids state = DetokenizeState(len(input_ids)) - for outputs in model_inst.stream_infer( + async for outputs in model_inst.async_stream_infer( session_id, input_ids=input_ids, gen_config=GenerationConfig(max_new_tokens=output_seqlen, @@ -123,7 +123,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, prev = now # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): - model_inst.end(session_id) + await model_inst.async_end(session_id) assert output_seqlen <= n_token <= output_seqlen + 1, \ f'Error. session_id({session_id}) request {output_seqlen} ' \ f'tokens, but generate {n_token} tokens.\n' \ @@ -139,13 +139,12 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, # skip the first token latency per_token_latency_stats.append(_per_token_latency_stats[1:]) self.pbar.update(1) - res_queue.put((session_id, stats, per_token_latency_stats)) + res_queue.put_nowait((session_id, stats, per_token_latency_stats)) def process_request(self, requests, concurrency, temperature, top_p, top_k, stream_output): res_queue = Queue() req_queue = Queue() - threads = [] self.pbar = tqdm(total=len(requests)) @@ -157,18 +156,20 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, start = time.time() + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + # start threads + tasks = [] for i in range(concurrency): - t = Thread(target=self._inference, - args=(req_queue, res_queue, i, temperature, top_p, - top_k, stream_output), - daemon=True) - t.start() - threads.append(t) + task = self._inference(req_queue, res_queue, i, temperature, top_p, + top_k, stream_output) + tasks.append(task) + + async def _gather_tasks(tasks): + return await asyncio.gather(*tasks) - # wait for finish - for t in threads: - t.join() + event_loop.run_until_complete(_gather_tasks(tasks)) elapsed_time = time.time() - start @@ -333,7 +334,6 @@ def main(): block_size=args.cache_block_seq_len, max_batch_size=args.concurrency, tp=args.tp, - thread_safe=True, eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, quant_policy=args.quant_policy, diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index d01d6fe9b4..8261b869f0 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -94,7 +94,10 @@ def forward( kv_seqlens = 
attn_metadata.kv_seqlens kv_flatten_size = attn_metadata.kv_flatten_size quant_policy = attn_metadata.quant_policy - max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) + if attn_metadata.is_decoding: + max_q_seqlen = 1 + else: + max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) fill_max_q_seqlen = max_q_seqlen if attn_metadata.fill_seqlens is not None: fill_seqlens = attn_metadata.fill_seqlens diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 26b507e9d4..b7a803a7a7 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -172,6 +172,7 @@ def __init__(self, self._start_loop() self._create_buffers() self.engine_instance = self.create_instance() + self._output_stream = torch.cuda.Stream() @classmethod def from_pretrained(cls, @@ -673,7 +674,8 @@ async def __long_context_single_forward(inputs): return ret def _make_infer_outputs(self, next_token_ids: torch.LongTensor, - logits: torch.Tensor, stopped: torch.Tensor): + logits: torch.Tensor, stopped: torch.Tensor, + event: torch.cuda.Event): """make infer output.""" def __get_out_token_ids(token: torch.Tensor, msg: SchedulerSequence, @@ -694,6 +696,11 @@ def __get_q_start_loc(): else: return seq_length.cumsum(0) - seq_length + with torch.cuda.stream(self._output_stream): + event.wait() + next_token_ids = next_token_ids.cpu() + stopped = stopped.cpu() + running = self._running is_run = [seq.status == MessageStatus.RUNNING for seq in running] stopped = stopped.tolist() @@ -755,6 +762,8 @@ def __update_inputs(next_token_ids): logger.debug(': ' f'batch_size={inputs.seq_length.size(0)} ' f'num_tokens={inputs.input_ids.size(-1)}') + if self.gpu_count == 1: + inputs = inputs.to_device('cuda') is_decoding = inputs.is_decoding if all_ids is not None: all_ids = all_ids.cuda() @@ -785,10 +794,11 @@ def __update_inputs(next_token_ids): next_token_ids, sampling_inputs.stop_words, num_appendable_ids) # send output - stopped = stopped.cpu() - finish = stopped.all().item() or (idx == loop_count - 1) + finish = (idx == loop_count - 1) finish = finish or _check_finish(self.scheduler, idx) - output = (next_token_ids.cpu(), logits, stopped) + event = torch.cuda.Event() + event.record() + output = (next_token_ids, logits, stopped, event) output_que.put_nowait((finish, output)) if finish: @@ -951,9 +961,9 @@ async def __step(): try: if isinstance(out, Exception): raise out - next_token_ids, logits, stopped = out + next_token_ids, logits, stopped, event = out step_outputs = self._make_infer_outputs( - next_token_ids, logits, stopped) + next_token_ids, logits, stopped, event) __send_resps(step_outputs) except Exception as e: raise e diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 54740a4fb3..24cb336d71 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -21,10 +21,9 @@ def _process_temperature_(scores: torch.Tensor, temperature: torch.Tensor): def _process_bad_words_(scores: torch.Tensor, bad_words: torch.LongTensor, + mask: torch.BoolTensor, filter_value: float = -float('inf')): """process bad words.""" - mask = bad_words >= 0 - bad_words = bad_words.where(mask, 0) filtered_scores = scores.gather(1, bad_words) filtered_scores[mask] = filter_value scores.scatter_(1, bad_words, filtered_scores) @@ -127,7 +126,9 @@ def _guided_sampling(response_formats: Tuple[Dict], scores: torch.Tensor, class SamplingInputs: temperature: torch.Tensor = None bad_words: 
torch.LongTensor = None + bad_mask: torch.BoolTensor = None stop_words: torch.LongTensor = None + stop_mask: torch.BoolTensor = None repetition_penalty: torch.Tensor = None top_k: torch.LongTensor = None top_p: torch.Tensor = None @@ -200,9 +201,11 @@ def __get_bad_words(bad_words): """get bad words.""" max_bw_len = max(len(bw) for bw in bad_words) if max_bw_len == 0: - return None + return None, None if all(len(bw) == max_bw_len for bw in bad_words): - return torch.tensor(bad_words) + ret = torch.tensor(bad_words) + mask = torch.ones_like(ret, dtype=bool) + return ret, mask ret = torch.full((batch_size, max_bw_len), -1, dtype=torch.int64) for idx, bw in enumerate(bad_words): bw_len = len(bw) @@ -210,7 +213,10 @@ def __get_bad_words(bad_words): continue bw = ret.new_tensor(bw) ret[idx, :bw_len] = bw - return ret + + mask = ret >= 0 + ret = ret.where(mask, 0) + return ret, mask __gather_params() @@ -221,8 +227,8 @@ def __get_bad_words(bad_words): temperature = torch.tensor(temperature) - bad_words = __get_bad_words(bad_words) - stop_words = __get_bad_words(stop_words) + bad_words, bad_mask = __get_bad_words(bad_words) + stop_words, stop_mask = __get_bad_words(stop_words) max_top_k = max(top_k) if min(top_k) <= 0: @@ -243,7 +249,9 @@ def __get_bad_words(bad_words): sampling_input = cls( temperature=temperature, bad_words=bad_words, + bad_mask=bad_mask, stop_words=stop_words, + stop_mask=stop_mask, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, @@ -326,12 +334,14 @@ def __call__(self, all_ids: torch.LongTensor, bad_words = sampling_inputs.bad_words if bad_words is not None: - scores = _process_bad_words_(scores, bad_words) + bad_mask = sampling_inputs.bad_mask + scores = _process_bad_words_(scores, bad_words, bad_mask) stop_words = sampling_inputs.stop_words if stop_words is not None: - stop_words = torch.where(self.ignore_eos[:, None], stop_words, -1) - scores = _process_bad_words_(scores, stop_words) + stop_mask = sampling_inputs.stop_mask + stop_mask = torch.where(self.ignore_eos[:, None], stop_mask, False) + scores = _process_bad_words_(scores, stop_words, stop_mask) scores = _guided_sampling(sampling_inputs.response_formats, scores, guided_input_ids, self.tokenizer) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 2877f59375..59d77f264a 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -162,10 +162,6 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig): self.model_config = model_config self.cache_config = cache_config - def get_block_numel(self): - """get block nelement.""" - raise NotImplementedError('Not implemented') - async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. @@ -177,17 +173,6 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, """ raise NotImplementedError('Not implemented.') - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. 
- """ - raise NotImplementedError('Not implemented.') - def get_logits(self, hidden_states: torch.Tensor): """get logits of model output.""" raise NotImplementedError('Not implemented.') @@ -255,11 +240,6 @@ def _build_model(self, device=device) return patched_model - def get_block_numel(self): - """get block nelement.""" - k_cache = self.cache_engine.local_gpu_cache[0][0] - return k_cache[0].numel() - def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): cache_swapping(self.cache_engine, @@ -274,21 +254,6 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, ) return output - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. - """ - output = self._forward_impl(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) - self.stream.synchronize() - return output - async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. @@ -301,8 +266,9 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, output = self._forward_impl(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map) - await asyncio.get_event_loop().run_in_executor(None, - self.stream.synchronize) + await asyncio.sleep(0) + while not self.stream.query(): + await asyncio.sleep(0) return output def get_logits(self, hidden_states: torch.Tensor): @@ -688,11 +654,6 @@ def _build_model( return model, cache_engine, cache_config - def get_block_numel(self): - """get block nelement.""" - k_cache = self.cache_engine.local_gpu_cache[0][0] - return k_cache[0].numel() - def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """forward impl.""" @@ -713,21 +674,6 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, ) return output - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. - """ - output = self._forward_impl(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) - self.stream.synchronize() - return output - async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. 
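# Illustrative sketch (not taken from the patches): the non-blocking GPU wait
# used in the hunk above -- instead of pushing stream.synchronize() onto a
# thread executor, the coroutine polls the non-blocking stream.query() and
# yields control with asyncio.sleep(0) until the queued work has finished.
# The usage lines are hypothetical and assume a CUDA device is present.
import asyncio

import torch


async def wait_stream(stream: torch.cuda.Stream):
    """Yield to the event loop until all work queued on `stream` completes."""
    await asyncio.sleep(0)
    while not stream.query():  # True once every kernel on the stream is done
        await asyncio.sleep(0)

# stream = torch.cuda.Stream()
# with torch.cuda.stream(stream):
#     out = some_model(inputs)        # hypothetical forward launch
# asyncio.run(wait_stream(stream))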
@@ -740,8 +686,9 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, output = self._forward_impl(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map) - await asyncio.get_event_loop().run_in_executor(None, - self.stream.synchronize) + await asyncio.sleep(0) + while not self.stream.query(): + await asyncio.sleep(0) return output def get_logits(self, hidden_states: torch.Tensor): diff --git a/lmdeploy/pytorch/kernels/cuda/activation.py b/lmdeploy/pytorch/kernels/cuda/activation.py index 2533840a95..9a00e7354f 100644 --- a/lmdeploy/pytorch/kernels/cuda/activation.py +++ b/lmdeploy/pytorch/kernels/cuda/activation.py @@ -7,10 +7,8 @@ TRITON_VERSION = version.parse(triton.__version__) if TRITON_VERSION >= version.parse('3.0.0'): - fast_expf = tl.math.exp else: - tanh = tl.math.tanh fast_expf = tl.math.fast_expf @@ -26,63 +24,29 @@ def _silu_and_mul_kernel( BLOCK_SIZE_N: tl.constexpr, ): """silu and mul kernel.""" - m_id = tl.program_id(0) + n_block_id = tl.program_id(0) + m_id = tl.program_id(1) up_ptr = gateup_ptr + N * stride_gun - offs_n = tl.arange(0, BLOCK_SIZE_N) + offs_n = n_block_id * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) gate_ptrs = gateup_ptr + m_id * stride_gum + offs_n * stride_gun up_ptrs = up_ptr + m_id * stride_gum + offs_n * stride_gun out_ptrs = out_ptr + m_id * stride_om + offs_n * stride_on - for _ in range(0, N, BLOCK_SIZE_N): - gate = tl.load(gate_ptrs).to(tl.float32) - up = tl.load(up_ptrs).to(tl.float32) - - gate = gate / (1 + fast_expf(-gate)) - out = gate * up - - tl.store(out_ptrs, out) - - gate_ptrs += BLOCK_SIZE_N * stride_gun - up_ptrs += BLOCK_SIZE_N * stride_gun - out_ptrs += BLOCK_SIZE_N * stride_on - - -@triton.jit -def _silu_and_mul_no_align_kernel( - gateup_ptr, - out_ptr, - N: tl.constexpr, - stride_gum: tl.constexpr, - stride_gun: tl.constexpr, - stride_om: tl.constexpr, - stride_on: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, -): - """silu and mul kernel.""" - m_id = tl.program_id(0) - - up_ptr = gateup_ptr + N * stride_gun - - offs_n = tl.arange(0, BLOCK_SIZE_N) - gate_ptrs = gateup_ptr + m_id * stride_gum + offs_n * stride_gun - up_ptrs = up_ptr + m_id * stride_gum + offs_n * stride_gun - out_ptrs = out_ptr + m_id * stride_om + offs_n * stride_on - - for n in range(0, N, BLOCK_SIZE_N): - mask = n + offs_n < N - gate = tl.load(gate_ptrs, mask=mask).to(tl.float32) - up = tl.load(up_ptrs, mask=mask).to(tl.float32) - - gate = gate / (1 + fast_expf(-gate)) - out = gate * up + if N % BLOCK_SIZE_N == 0: + mask = None + else: + mask = offs_n < N + gate = tl.load(gate_ptrs, mask=mask) + up = tl.load(up_ptrs, mask=mask) + gate = gate.to(tl.float32) + up = up.to(tl.float32) - tl.store(out_ptrs, out, mask=mask) + gate = gate / (1 + fast_expf(-gate)) + out = gate * up - gate_ptrs += BLOCK_SIZE_N * stride_gun - up_ptrs += BLOCK_SIZE_N * stride_gun - out_ptrs += BLOCK_SIZE_N * stride_on + tl.store(out_ptrs, out, mask=mask) def silu_and_mul(gate_up: torch.Tensor, out: torch.Tensor = None): @@ -96,31 +60,22 @@ def silu_and_mul(gate_up: torch.Tensor, out: torch.Tensor = None): out = gate_up.new_empty(out_shape) BLOCK_SIZE_N = triton.next_power_of_2(N) - BLOCK_SIZE_N = min(BLOCK_SIZE_N, 1024) + BLOCK_SIZE_N = min(BLOCK_SIZE_N, 512) num_warps = 4 - num_stages = 2 - grid = (M, ) - if N % BLOCK_SIZE_N == 0: - _silu_and_mul_kernel[grid](gate_up, - out, - N, - stride_gum=gate_up.stride(0), - stride_gun=gate_up.stride(1), - stride_om=out.stride(0), - stride_on=out.stride(1), - BLOCK_SIZE_N=BLOCK_SIZE_N, - num_warps=num_warps, - 
num_stages=num_stages) - else: - _silu_and_mul_no_align_kernel[grid](gate_up, - out, - N, - stride_gum=gate_up.stride(0), - stride_gun=gate_up.stride(1), - stride_om=out.stride(0), - stride_on=out.stride(1), - BLOCK_SIZE_N=BLOCK_SIZE_N, - num_warps=num_warps, - num_stages=num_stages) + num_stages = 1 + grid = ( + triton.cdiv(N, BLOCK_SIZE_N), + M, + ) + _silu_and_mul_kernel[grid](gate_up, + out, + N, + stride_gum=gate_up.stride(0), + stride_gun=gate_up.stride(1), + stride_om=out.stride(0), + stride_on=out.stride(1), + BLOCK_SIZE_N=BLOCK_SIZE_N, + num_warps=num_warps, + num_stages=num_stages) return out diff --git a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py index 9e14dc6a0c..f9d5f2f171 100644 --- a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py @@ -4,35 +4,9 @@ import triton.language as tl from torch import Tensor -from .triton_utils import get_kernel_meta, wrap_jit_func - - -@wrap_jit_func(type_hint=dict( - Q=Tensor, - K=Tensor, - COS=Tensor, - SIN=Tensor, - POS=Tensor, - Q_EMB=Tensor, - K_EMB=Tensor, - seq_len=int, - stride_qs=int, - stride_qh=int, - stride_qd=int, - stride_ks=int, - stride_kh=int, - stride_kd=int, - stride_qes=int, - stride_qeh=int, - stride_qed=int, - stride_kes=int, - stride_keh=int, - stride_ked=int, - half_size=torch.int32, - BLOCK=torch.int32, - BLOCK_QH=torch.int32, - BLOCK_N=torch.int32, -)) +from .triton_utils import get_kernel_meta + + @triton.jit(do_not_specialize=('seq_len', )) def apply_rotary_pos_emb_qk_kernel( Q, @@ -60,8 +34,8 @@ def apply_rotary_pos_emb_qk_kernel( BLOCK_N: tl.constexpr, ): """apply rotary on key AND query kernel.""" - seq_block_id = tl.program_id(0) - head_id = tl.program_id(1) + seq_block_id = tl.program_id(1) + head_id = tl.program_id(0) pos_offset = seq_block_id * BLOCK + tl.arange(0, BLOCK) pos_mask = pos_offset < seq_len @@ -158,10 +132,13 @@ def apply_rotary_pos_emb(q: Tensor, num_heads_q = q.size(-2) num_heads_k = k.size(-2) num_warps = 4 - num_stages = 4 + num_stages = 1 kernel_meta = get_kernel_meta(q) - grid = [triton.cdiv(seq_len, BLOCK), num_heads_q + num_heads_k] + grid = [ + num_heads_q + num_heads_k, + triton.cdiv(seq_len, BLOCK), + ] apply_rotary_pos_emb_qk_kernel[grid](q, k, cos, diff --git a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py index 9ef614fadd..93bd89f488 100644 --- a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py @@ -1,12 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
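# Illustrative sketch (not taken from the patches): a plain-PyTorch rotate-half
# reference for the rotary embedding computed by the Triton kernel above, handy
# for sanity-checking the kernel after launch-grid changes. The [seq, heads,
# dim] layout and broadcasting below are assumptions for illustration only.
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_ref(q, k, cos, sin):
    # q, k: [seq, heads, dim]; cos, sin: [seq, dim], broadcast over heads
    cos = cos.unsqueeze(1)
    sin = sin.unsqueeze(1)
    q_emb = q * cos + rotate_half(q) * sin
    k_emb = k * cos + rotate_half(k) * sin
    return q_emb, k_emb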
from typing import Literal -import torch import triton import triton.language as tl from torch import Tensor -from .triton_utils import get_kernel_meta, wrap_jit_func +from .triton_utils import get_kernel_meta @triton.jit @@ -38,37 +37,6 @@ def _quant_int4(val1, val2): return q_val, scales, zeros -@wrap_jit_func(type_hint=dict( - KStates=Tensor, - VStates=Tensor, - KCaches=Tensor, - VCaches=Tensor, - QStartLoc=Tensor, - QSeqLens=Tensor, - KVSeqLens=Tensor, - BlockOffsets=Tensor, - num_heads=torch.int32, - head_dim=torch.int32, - stride_kss=int, - stride_ksh=int, - stride_ksd=int, - stride_vss=int, - stride_vsh=int, - stride_vsd=int, - stride_kcn=int, - stride_kcb=int, - stride_kch=int, - stride_kcd=int, - stride_vcn=int, - stride_vcb=int, - stride_vch=int, - stride_vcd=int, - stride_boff=int, - BLOCK=torch.int32, - BLOCK_D=torch.int32, - BLOCK_DV=torch.int32, - BLOCK_H=torch.int32, -)) @triton.jit def _fill_kv_cache_kernel( KStates, @@ -79,7 +47,7 @@ def _fill_kv_cache_kernel( QSeqLens, KVSeqLens, BlockOffsets, - num_heads: tl.constexpr, + is_decoding: tl.constexpr, head_dim: tl.constexpr, head_dim_v: tl.constexpr, stride_kss, @@ -100,108 +68,70 @@ def _fill_kv_cache_kernel( BLOCK: tl.constexpr, BLOCK_D: tl.constexpr, BLOCK_DV: tl.constexpr, - BLOCK_H: tl.constexpr, ): """fill kv cache kernel.""" - batch_id = tl.program_id(0) + batch_id = tl.program_id(2) + head_id = tl.program_id(0) block_id = tl.program_id(1) - # initialize - h_off = tl.arange(0, BLOCK_H) - d_off = tl.arange(0, BLOCK_D) - q_startloc = tl.load(QStartLoc + batch_id) q_seqlen = tl.load(QSeqLens + batch_id) kv_seqlen = tl.load(KVSeqLens + batch_id) history_seqlen = kv_seqlen - q_seqlen - block0_first_tokenloc = history_seqlen % BLOCK - - state_token_offset = tl.maximum(block_id * BLOCK - block0_first_tokenloc, - 0) - kv_block_id = _div_up(history_seqlen + 1, BLOCK) - 1 + block_id - kv_block_id = min(kv_block_id, stride_boff - 1) - block_off = tl.load(BlockOffsets + batch_id * stride_boff + kv_block_id) + kv_block_id = history_seqlen // BLOCK + block_id - cur_startloc = q_startloc + state_token_offset - ks_ptr = KStates + cur_startloc * stride_kss - vs_ptr = VStates + cur_startloc * stride_vss + if kv_seqlen <= 0: + return - kc_ptr = KCaches + block_off * stride_kcn - vc_ptr = VCaches + block_off * stride_vcn + if kv_block_id * BLOCK >= kv_seqlen: + return - c_first_tokenloc = block0_first_tokenloc - if block_id != 0: - c_first_tokenloc *= 0 - c_last_tokenloc = tl.minimum( - BLOCK, q_seqlen + block0_first_tokenloc - block_id * BLOCK) + if is_decoding: + page_offs = tl.full((1, ), history_seqlen % BLOCK, dtype=tl.int32) + kv_mask = tl.full((1, ), 1, dtype=tl.int1) + q_offs = tl.full((1, ), q_startloc, dtype=tl.int32) + else: + page_offs = tl.arange(0, BLOCK) + kv_offs = kv_block_id * BLOCK + page_offs + kv_mask = (kv_offs >= history_seqlen) & (kv_offs < kv_seqlen) + token_off = q_startloc + kv_block_id * BLOCK - history_seqlen + q_offs = token_off + page_offs - for bidx in range(c_first_tokenloc, c_last_tokenloc): - sidx = bidx - c_first_tokenloc - mask = (h_off[:, None] < num_heads) & (d_off[None, :] < head_dim) - k = tl.load(ks_ptr + sidx * stride_kss + h_off[:, None] * stride_ksh + - d_off[None, :] * stride_ksd, - mask=mask) - tl.store(kc_ptr + bidx * stride_kcb + h_off[:, None] * stride_kch + - d_off[None, :] * stride_kcd, - k, - mask=mask) + block_off = tl.load(BlockOffsets + batch_id * stride_boff + kv_block_id) - if BLOCK_DV > 0: - dv_off = tl.arange(0, BLOCK_DV) - maskv = (h_off[:, None] < num_heads) & (dv_off[None, :] 
< - head_dim_v) - v = tl.load(vs_ptr + sidx * stride_vss + - h_off[:, None] * stride_vsh + - dv_off[None, :] * stride_vsd, - mask=maskv) - tl.store(vc_ptr + bidx * stride_vcb + h_off[:, None] * stride_vch + - dv_off[None, :] * stride_vcd, - v, - mask=maskv) + d_off = tl.arange(0, BLOCK_D) + mask_ks = kv_mask[:, None] + mask_kc = mask_ks & (d_off[None, :] < head_dim) + d_off = d_off % head_dim + + ks_ptr = KStates + head_id * stride_ksh + ks_ptrs = ks_ptr + q_offs[:, + None] * stride_kss + d_off[None, :] * stride_ksd + kc_ptr = KCaches + block_off * stride_kcn + head_id * stride_kch + kc_ptrs = kc_ptr + page_offs[:, None] * stride_kcb + d_off[ + None, :] * stride_kcd + + if BLOCK_DV > 0: + dv_off = tl.arange(0, BLOCK_DV) + mask_vs = kv_mask[:, None] + mask_vc = mask_vs & (dv_off[None, :] < head_dim_v) + dv_off = dv_off % head_dim_v + vs_ptr = VStates + head_id * stride_vsh + vs_ptrs = vs_ptr + q_offs[:, None] * stride_vss + dv_off[ + None, :] * stride_vsd + vc_ptr = VCaches + block_off * stride_vcn + head_id * stride_vch + vc_ptrs = vc_ptr + page_offs[:, None] * stride_vcb + dv_off[ + None, :] * stride_vcd + + k = tl.load(ks_ptrs, mask=mask_ks) + if BLOCK_DV > 0: + v = tl.load(vs_ptrs, mask=mask_vs) + tl.store(kc_ptrs, k, mask=mask_kc) + if BLOCK_DV > 0: + tl.store(vc_ptrs, v, mask=mask_vc) -@wrap_jit_func(type_hint=dict( - KStates=Tensor, - VStates=Tensor, - KCaches=Tensor, - VCaches=Tensor, - KScalesZeros=Tensor, - VScalesZeros=Tensor, - QStartLoc=Tensor, - QSeqLens=Tensor, - KVSeqLens=Tensor, - BlockOffsets=Tensor, - num_heads=torch.int32, - head_dim=torch.int32, - stride_kss=int, - stride_ksh=int, - stride_ksd=int, - stride_vss=int, - stride_vsh=int, - stride_vsd=int, - stride_kcn=int, - stride_kcb=int, - stride_kch=int, - stride_kcd=int, - stride_vcn=int, - stride_vcb=int, - stride_vch=int, - stride_vcd=int, - stride_kszn=int, - stride_kszb=int, - stride_kszh=int, - stride_kszd=int, - stride_vszn=int, - stride_vszb=int, - stride_vszh=int, - stride_vszd=int, - stride_boff=int, - BLOCK=torch.int32, - BLOCK_D=torch.int32, - BLOCK_DV=torch.int32, - BLOCK_H=torch.int32, -)) @triton.jit def _fill_kv_cache_quant_kernel( KStates, @@ -394,15 +324,19 @@ def fill_kv_cache(k_states: Tensor, num_heads = k_caches.size(h_dim) head_dim = k_caches.size(d_dim) head_dim_v = v_states.size(-1) - max_num_blocks = triton.cdiv(max_q_seq_length, block_size) + 1 + if max_q_seq_length == 1: + max_num_blocks = 1 + else: + max_num_blocks = triton.cdiv(max_q_seq_length, block_size) + 1 BLOCK = block_size BLOCK_H = triton.next_power_of_2(num_heads) BLOCK_D = triton.next_power_of_2(head_dim) BLOCK_DV = triton.next_power_of_2(head_dim_v) - grid = [batch_size, max_num_blocks] kernel_meta = get_kernel_meta(k_states) if quant_policy == 0: + grid = [num_heads, max_num_blocks, batch_size] + is_decoding = max_num_blocks == 1 _fill_kv_cache_kernel[grid]( k_states, v_states, @@ -412,7 +346,7 @@ def fill_kv_cache(k_states: Tensor, q_seq_length, kv_seq_length, block_offsets, - num_heads=num_heads, + is_decoding=is_decoding, head_dim=head_dim, head_dim_v=head_dim_v, stride_kss=k_states.stride(-3), @@ -433,12 +367,12 @@ def fill_kv_cache(k_states: Tensor, BLOCK=BLOCK, BLOCK_D=BLOCK_D, BLOCK_DV=BLOCK_DV, - BLOCK_H=BLOCK_H, num_warps=4, num_stages=3, **kernel_meta, ) else: + grid = [batch_size, max_num_blocks] _fill_kv_cache_quant_kernel[grid]( k_states, v_states, diff --git a/lmdeploy/pytorch/models/utils/cudagraph.py b/lmdeploy/pytorch/models/utils/cudagraph.py index 149376e4be..74d090a9a3 100644 --- 
a/lmdeploy/pytorch/models/utils/cudagraph.py +++ b/lmdeploy/pytorch/models/utils/cudagraph.py @@ -70,15 +70,14 @@ def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, input_buffers['block_offsets'] = torch.zeros((max_batches, num_blocks), dtype=torch.int64, device=device) - input_buffers['q_start_loc'] = torch.zeros(max_batches, - dtype=torch.int64, - device=device) - input_buffers['q_seqlens'] = torch.zeros(max_batches, - dtype=torch.int64, - device=device) - input_buffers['kv_seqlens'] = torch.zeros(max_batches, - dtype=torch.int64, - device=device) + + input_buffers['qkv_lens'] = torch.zeros(3, + max_batches, + dtype=torch.int64, + device=device) + input_buffers['q_start_loc'] = input_buffers['qkv_lens'][0] + input_buffers['q_seqlens'] = input_buffers['qkv_lens'][1] + input_buffers['kv_seqlens'] = input_buffers['qkv_lens'][2] input_buffers['local_adapter_ids'] = torch.zeros(max_batches, dtype=torch.int64, device=device) @@ -111,13 +110,10 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_buffers['position_ids'][:, :num_tokens] = position_ids input_buffers[ 'block_offsets'][:batch_size, :num_blocks] = block_offsets - if q_seqlens.data_ptr() != input_buffers['q_seqlens'].data_ptr(): - input_buffers['q_seqlens'].zero_() - input_buffers['q_seqlens'][:batch_size] = q_seqlens - if kv_seqlens.data_ptr() != input_buffers['kv_seqlens'].data_ptr(): - input_buffers['kv_seqlens'].zero_() - input_buffers['kv_seqlens'][:batch_size] = kv_seqlens - input_buffers['q_start_loc'][:batch_size] = q_start_loc + + qkv = torch.stack((q_start_loc, q_seqlens, kv_seqlens)) + input_buffers['qkv_lens'].zero_() + input_buffers['qkv_lens'][:, :batch_size] = qkv if inputs_embeds is not None: emb_size = inputs_embeds.size(-1) if 'inputs_embeds' not in input_buffers: diff --git a/tests/pytorch/engine/test_logits_process.py b/tests/pytorch/engine/test_logits_process.py index 5c5fdbdc18..69c8315411 100644 --- a/tests/pytorch/engine/test_logits_process.py +++ b/tests/pytorch/engine/test_logits_process.py @@ -35,8 +35,9 @@ def test_process_bad_words(): [4, 4], [-1, -1], ]) + mask = bad_words >= 0 - out_scores = _process_bad_words_(scores, bad_words) + out_scores = _process_bad_words_(scores, bad_words.where(mask, 0), mask) for score, bw in zip(out_scores, bad_words): bw = bw.tolist() From 0dedd73e5727776e2392b6f7256e0f66d0c48c8b Mon Sep 17 00:00:00 2001 From: q yao Date: Tue, 3 Dec 2024 14:44:22 +0800 Subject: [PATCH 35/40] fix the logic to verify whether AutoAWQ has been successfully installed (#2844) --- lmdeploy/pytorch/backends/cuda/awq_modules.py | 2 -- lmdeploy/pytorch/backends/cuda/op_backend.py | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/backends/cuda/awq_modules.py b/lmdeploy/pytorch/backends/cuda/awq_modules.py index f3cbf8bee4..8159bbf554 100644 --- a/lmdeploy/pytorch/backends/cuda/awq_modules.py +++ b/lmdeploy/pytorch/backends/cuda/awq_modules.py @@ -53,8 +53,6 @@ class AwqLinearW4A16Impl(LinearW4A16Impl): def __init__(self, in_features: int, out_features: int, w_bit: int, group_size: int): - from awq.modules.linear.gemm import AWQ_INSTALLED - assert AWQ_INSTALLED self.in_features = in_features self.out_features = out_features self.w_bit = w_bit diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index 3e7fc23728..d796f8e19f 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -48,7 +48,11 @@ def get_layer_impl_builder(cls, 
layer_type: OpType): from .activation import TritonSiluAndMulBuilder return TritonSiluAndMulBuilder elif layer_type == OpType.LinearW4A16: - from awq.modules.linear.gemm import AWQ_INSTALLED + try: + from awq.modules.linear.gemm import awq_ext # noqa: F401 + AWQ_INSTALLED = True + except Exception: + AWQ_INSTALLED = False if AWQ_INSTALLED: from .awq_modules import AwqLinearW4A16Builder return AwqLinearW4A16Builder From efa8ac032005091a17f6c4555917d400c44486ba Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 3 Dec 2024 14:45:57 +0800 Subject: [PATCH 36/40] check whether backend_config is None or not before accessing its attr (#2848) --- lmdeploy/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index 2b4204a53b..42b7c6e4c1 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -71,7 +71,7 @@ def pipeline(model_path: str, task, pipeline_class = get_task(model_path) if task == 'vlm': - if backend_config.enable_prefix_caching: + if backend_config and backend_config.enable_prefix_caching: backend_config.enable_prefix_caching = False logger.warning('VLM does not support prefix caching.') From a6645b228674a294eed00a73c508a00081c3dc6e Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:44:35 +0800 Subject: [PATCH 37/40] [dlinfer] change dlinfer kv_cache layout and ajust paged_prefill_attention api. (#2847) * opt update_step_ctx on maca. * change kv_cache layout and ajust paged_prefill_attention. * opt maca update context. --- .../backends/dlinfer/maca/op_backend.py | 80 +++++++++---------- .../pytorch/kernels/dlinfer/pagedattention.py | 3 + 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py index 084cae1bfe..a68ed9ac3a 100644 --- a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py @@ -12,6 +12,7 @@ class MacaOpsBackend(DlinferOpsBackend): """maca layer backend.""" + total_slots = None @staticmethod def get_name() -> str: @@ -25,10 +26,8 @@ def get_k_block_shape( head_size: int, dtype: torch.dtype, ) -> Tuple[int, ...]: - if head_size == 576: - x = 16 - return (num_heads, head_size // x, block_size, x) - return (num_heads, block_size, head_size) + x = 16 + return (num_heads, head_size // x, block_size, x) @staticmethod def get_v_block_shape( @@ -42,11 +41,25 @@ def get_v_block_shape( @classmethod def update_step_context(cls, step_context): """update step context.""" + + def get_total_slots(): + if cls.total_slots is None: + cls.total_slots = torch.arange( + block_num * block_size, + dtype=torch.long, + device=step_context.block_offsets.device) + cls.total_slots = cls.total_slots.view(block_num, block_size) + return cls.total_slots + kv_start_indices, attention_mask = [], [] - block_num, _, block_size, _ = step_context.kv_caches[0][0].shape + block_num, _, block_size, _ = step_context.kv_caches[0][1].shape device = step_context.block_offsets.device is_unpaged_prefill = False + if not step_context.is_decoding: + is_unpaged_prefill = \ + all((step_context.q_seqlens == + step_context.kv_seqlens).tolist()) q_start_loc = torch.cat((torch.tensor([0], device=device), step_context.q_seqlens.cumsum(0))).int() q_seqlens = step_context.q_seqlens.int() @@ -54,43 +67,26 @@ def update_step_context(cls, step_context): max_q_seq_len = torch.max(q_seqlens).item() max_kv_seq_len = torch.max(kv_seqlens).item() - if not 
step_context.is_decoding: - is_unpaged_prefill = \ - all((step_context.q_seqlens == - step_context.kv_seqlens).tolist()) - if is_unpaged_prefill: - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(max_q_seq_len, - max_kv_seq_len, - dtype=torch.bool).cuda(), - diagonal=max_kv_seq_len - max_q_seq_len, - )) - attention_mask.append(single_attention_mask) - total_slots = torch.arange(block_num * block_size, - dtype=torch.long, - device=device) - total_slots = total_slots.view(block_num, block_size) - for i in range(step_context.q_start_loc.size(0)): - q_seq_len = int(step_context.q_seqlens[i]) - kv_seq_len = int(step_context.kv_seqlens[i]) - if not (step_context.is_decoding or is_unpaged_prefill): - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(step_context.q_seqlens[i], - step_context.block_offsets.shape[1] * - block_size, - dtype=torch.bool).cuda(), - diagonal=step_context.kv_seqlens[i] - - step_context.q_seqlens[i], - )) - attention_mask.append(single_attention_mask) - history_length = kv_seq_len - q_seq_len - slot_tables = total_slots[step_context.block_offsets[i]].flatten() - slot_indices = [p for p in range(history_length, kv_seq_len)] - slots = slot_tables[slot_indices].reshape((-1, 1)) - kv_start_indices.append(slots) - kv_start_indices = torch.cat(kv_start_indices) + if step_context.is_decoding: + # collect kv_start_indices without using a for-loop, + # (fill kv-cache for just ONE token during the decoding phase) + idx = (step_context.kv_seqlens - 1) % block_size + b_num = (step_context.kv_seqlens - 1) // block_size + last_block = step_context.block_offsets.gather( + 1, b_num.view(-1, 1)).view(-1) + kv_start_indices = (last_block * block_size + idx).reshape((-1, 1)) + else: + for i in range(step_context.q_start_loc.size(0)): + q_seq_len = int(step_context.q_seqlens[i]) + kv_seq_len = int(step_context.kv_seqlens[i]) + # collect kv start indices during the prefill phase. + history_length = kv_seq_len - q_seq_len + total_slots = get_total_slots() + slot_tables = total_slots[step_context.block_offsets[i]].view( + -1) + slots = slot_tables[history_length:kv_seq_len] + kv_start_indices.append(slots) + kv_start_indices = torch.cat(kv_start_indices) attn_meta_cls = cls.get_attention_metadata_cls() attn_metadata = attn_meta_cls( diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 21c72074a4..47bcb0cfff 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -39,6 +39,8 @@ def prefill_attention( else: return ext_ops.paged_prefill_attention( query_states, + key_states, + value_states, key_cache, value_cache, block_offsets, @@ -46,6 +48,7 @@ def prefill_attention( q_start_loc, q_seq_len, kv_seq_len, + max_q_seq_len, num_q_heads, num_kv_heads, attn_mask, From cc8cfb0b456cb05cf1089ff1f27706d395e70864 Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:45:35 +0800 Subject: [PATCH 38/40] [maca] add env to support different mm layout on maca. (#2835) * add env to support different mm layout on maca. * rename env variable. 
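# Illustrative sketch (not taken from the patches): the vectorized decode-phase
# slot lookup from the maca update_step_context change above -- the slot of the
# single new token of every sequence is read from its paged block table without
# a Python loop. Shapes and numbers below are toy values.
import torch

block_size = 4
# block_offsets[i] = physical block ids assigned to sequence i
block_offsets = torch.tensor([[7, 2, 5], [3, 9, 1]])
kv_seqlens = torch.tensor([6, 9])        # lengths including the new token

idx = (kv_seqlens - 1) % block_size      # offset of the new token in its block
b_num = (kv_seqlens - 1) // block_size   # which logical block holds it
last_block = block_offsets.gather(1, b_num.view(-1, 1)).view(-1)
kv_start_indices = (last_block * block_size + idx).reshape(-1, 1)
# sequence 0: logical block 1 -> physical block 2, slot 2 * 4 + 1 = 9
# sequence 1: logical block 2 -> physical block 1, slot 1 * 4 + 0 = 4
print(kv_start_indices)                  # tensor([[9], [4]])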
--- lmdeploy/pytorch/backends/dlinfer/linear.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lmdeploy/pytorch/backends/dlinfer/linear.py b/lmdeploy/pytorch/backends/dlinfer/linear.py index 567a01dddf..5edfa7728d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/linear.py +++ b/lmdeploy/pytorch/backends/dlinfer/linear.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os from typing import Optional import torch @@ -11,6 +12,14 @@ class DlinferLinearImpl(LinearImpl): """Dlinfer linear implementation api.""" + def update_weights(self, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None): + """update weights.""" + if os.getenv('DLINER_LINEAR_USE_NN_LAYOUT', '0') == '1': + weight = weight.data.t().contiguous() + return weight, bias + def forward(self, x, weight: torch.Tensor, From 69a4306d2cd7d8f7790e39edfb3b8266282767fa Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:39:05 +0800 Subject: [PATCH 39/40] Supports W8A8 quantization for more models (#2850) * Supports W8A8 quantization for more models * update supported models --- docs/en/supported_models/supported_models.md | 48 +- .../supported_models/supported_models.md | 48 +- lmdeploy/lite/apis/calibrate.py | 3 + lmdeploy/lite/apis/smooth_quant.py | 66 +- lmdeploy/lite/quantization/awq.py | 11 +- lmdeploy/pytorch/modeling/__init__.py | 1 - .../pytorch/modeling/convert_to_qmodules.py | 59 - .../pytorch/modeling/modeling_baichuan.py | 824 ------- .../pytorch/modeling/modeling_internlm.py | 1171 ---------- .../pytorch/modeling/modeling_internlm2.py | 1940 ----------------- lmdeploy/pytorch/modeling/modeling_llama.py | 1297 ----------- 11 files changed, 63 insertions(+), 5405 deletions(-) delete mode 100644 lmdeploy/pytorch/modeling/__init__.py delete mode 100644 lmdeploy/pytorch/modeling/convert_to_qmodules.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_baichuan.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_internlm.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_internlm2.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_llama.py diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index cd43e79c94..469ece487f 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -51,47 +51,47 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | | Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | | ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | -| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | -| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | -| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | 
Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 7ec36d2351..d734523282 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -51,47 +51,47 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | | Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | | ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | -| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | -| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | -| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | 
No | - | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 0780e93594..71f7a5900c 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -27,6 +27,7 @@ 'ChatGLMForConditionalGeneration': 'GLMBlock', 'MixtralForCausalLM': 'MixtralDecoderLayer', 'Qwen2VLForConditionalGeneration': 'Qwen2VLDecoderLayer', + 'MistralForCausalLM': 'MistralDecoderLayer', } NORM_TYPE_MAP = { @@ -44,6 +45,7 @@ 'ChatGLMForConditionalGeneration': 'RMSNorm', 'MixtralForCausalLM': 'MixtralRMSNorm', 'Qwen2VLForConditionalGeneration': 'Qwen2RMSNorm', + 'MistralForCausalLM': 'MistralRMSNorm', } HEAD_NAME_MAP = { @@ -61,6 +63,7 @@ 'ChatGLMForConditionalGeneration': 'output_layer', 'MixtralForCausalLM': 'lm_head', 'Qwen2VLForConditionalGeneration': 'lm_head', + 'MistralForCausalLM': 'lm_head', } diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py index 45684602b2..c8df67355e 100644 --- a/lmdeploy/lite/apis/smooth_quant.py +++ b/lmdeploy/lite/apis/smooth_quant.py @@ -1,70 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
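The `MistralForCausalLM` entries added to `LAYER_TYPE_MAP`, `NORM_TYPE_MAP`, and `HEAD_NAME_MAP` above are keyed by the model's class name and tell the calibration path which decoder-layer, norm, and head modules to work on. Below is a minimal sketch of that lookup, assuming a loaded Hugging Face Mistral model bound to `model`; the name-matching helper and `calibration_targets` function are illustrative stand-ins, not code from this patch.

```python
# Illustrative only: how *_TYPE_MAP entries are consumed downstream.
# `model` is assumed to be a transformers MistralForCausalLM instance.
from torch import nn

LAYER_TYPE_MAP = {'MistralForCausalLM': 'MistralDecoderLayer'}
NORM_TYPE_MAP = {'MistralForCausalLM': 'MistralRMSNorm'}
HEAD_NAME_MAP = {'MistralForCausalLM': 'lm_head'}


def collect_by_class_name(model: nn.Module, class_name: str) -> dict:
    """Return {qualified_name: module} for submodules whose class matches."""
    return {
        name: mod
        for name, mod in model.named_modules()
        if type(mod).__name__ == class_name
    }


def calibration_targets(model: nn.Module):
    arch = type(model).__name__  # e.g. 'MistralForCausalLM'
    layers = collect_by_class_name(model, LAYER_TYPE_MAP[arch])
    norms = collect_by_class_name(model, NORM_TYPE_MAP[arch])
    head = getattr(model, HEAD_NAME_MAP[arch], None)
    return layers, norms, head
```

The hunk that follows removes the duplicate copies of these maps from `smooth_quant.py` and imports them from `calibrate` instead.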
- -import os.path as osp -import shutil - import fire import torch from torch import nn -import lmdeploy -from lmdeploy.lite.apis.calibrate import calibrate +from lmdeploy.lite.apis.calibrate import (LAYER_TYPE_MAP, NORM_TYPE_MAP, + calibrate) from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP, awq_layers, smooth_layers) from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.models import QLinear, QRMSNorm -LAYER_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMDecoderLayer', - 'InternLM2ForCausalLM': 'InternLM2DecoderLayer', - 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', - 'LlamaForCausalLM': 'LlamaDecoderLayer', - 'ChatGLMForConditionalGeneration': 'GLMBlock', -} -NORM_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMRMSNorm', - 'InternLM2ForCausalLM': 'InternLM2RMSNorm', - 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', - 'LlamaForCausalLM': 'LlamaRMSNorm', - 'ChatGLMForConditionalGeneration': 'RMSNorm', -} - -LMDEPLOY_ROOT = lmdeploy.__path__[0] - -MODEL_PATH_MAP = { - 'InternLMForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm.py'), - 'InternLM2ForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm2.py'), - 'LlamaForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_llama.py'), - 'BaiChuanForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_baichuan.py') -} - -AUTO_MAP = { - 'InternLMForCausalLM': { - 'AutoConfig': 'configuration_internlm.InternLMConfig', - 'AutoModel': 'modeling_internlm.InternLMForCausalLM', - 'AutoModelForCausalLM': 'modeling_internlm.InternLMForCausalLM' - }, - 'InternLM2ForCausalLM': { - 'AutoConfig': 'configuration_internlm2.InternLMConfig', - 'AutoModelForCausalLM': 'modeling_internlm2.InternLM2ForCausalLM', - 'AutoModel': 'modeling_internlm2.InternLM2ForCausalLM' - }, - 'LlamaForCausalLM': { - 'AutoModel': 'modeling_llama.LlamaForCausalLM', - 'AutoModelForCausalLM': 'modeling_llama.LlamaForCausalLM' - }, - 'BaiChuanForCausalLM': { - 'AutoConfig': 'configuration_baichuan.BaiChuanConfig', - 'AutoModelForCausalLM': 'modeling_baichuan.BaiChuanForCausalLM' - } -} - def smooth_quant(model: str, work_dir: str = './work_dir', @@ -146,11 +91,6 @@ def smooth_quant(model: str, setattr(parent, child_name, q_norm) norm.to('cpu') - if hasattr(model.config, 'auto_map'): - model.config.auto_map.update(AUTO_MAP[type(model).__name__]) - else: - model.config.auto_map = AUTO_MAP[type(model).__name__] - if vl_model: from .auto_awq import save_vl_model save_vl_model(vl_model, model_path, work_dir) @@ -162,8 +102,6 @@ def smooth_quant(model: str, safe_serialization=False) tokenizer.save_pretrained(work_dir) - shutil.copy(MODEL_PATH_MAP[type(model).__name__], work_dir) - if __name__ == '__main__': fire.Fire(smooth_quant) diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index 2efe41b6da..cf03a75216 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -50,7 +50,12 @@ 'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] - } + }, + 'MistralDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] + }, } FC_FCS_MAP = { @@ -92,6 +97,10 @@ 'Qwen2VLDecoderLayer': { 'self_attn.v_proj': ['self_attn.o_proj'], 'mlp.up_proj': ['mlp.down_proj'] + }, + 'MistralDecoderLayer': { + 
'self_attn.v_proj': ['self_attn.o_proj'], + 'mlp.up_proj': ['mlp.down_proj'] } } diff --git a/lmdeploy/pytorch/modeling/__init__.py b/lmdeploy/pytorch/modeling/__init__.py deleted file mode 100644 index ef101fec61..0000000000 --- a/lmdeploy/pytorch/modeling/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/pytorch/modeling/convert_to_qmodules.py b/lmdeploy/pytorch/modeling/convert_to_qmodules.py deleted file mode 100644 index 4a95c9f165..0000000000 --- a/lmdeploy/pytorch/modeling/convert_to_qmodules.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn - -from lmdeploy.pytorch.models import QLinear, QRMSNorm - -LAYER_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMDecoderLayer', - 'InternLM2ForCausalLM': 'InternLM2DecoderLayer', - 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', - 'LlamaForCausalLM': 'LlamaDecoderLayer', -} -NORM_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMRMSNorm', - 'InternLM2ForCausalLM': 'InternLM2RMSNorm', - 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', - 'LlamaForCausalLM': 'LlamaRMSNorm', -} - - -def convert_decoder_layer(module, norm_type): - """Converts a given module's child layers from regular Linear or RMSNorm to - their Quantized versions (QLinear, QRMSNorm). - - The conversion is done in place. - """ - for name, child in module.named_children(): - if isinstance(child, nn.Linear): - new_child = QLinear.from_float(child, initialization=False) - setattr(module, name, new_child) - elif type(child).__name__ == norm_type: - new_child = QRMSNorm.from_float(child, initialization=False) - setattr(module, name, new_child) - else: - convert_decoder_layer(child, norm_type) - - -def convert(module, layer_type, norm_type): - """Recursively traverses through given PyTorch module and identifies child - layers that match the specified layer_type and norm_type for conversion to - their Quantized counterparts. - - The conversion is done using the `convert_decoder_layer` function. - """ - for child in module.children(): - if type(child).__name__ == layer_type: - convert_decoder_layer(child, norm_type) - else: - convert(child, layer_type, norm_type) - - -def convert_to_qmodules(model): - """Convert all Linear and RMSNorm in the decoder layers of the model into - their Quantized versions (QLinear, QRMSNorm).""" - layer_type = LAYER_TYPE_MAP[type(model).__name__] - norm_type = NORM_TYPE_MAP[type(model).__name__] - convert(model, layer_type, norm_type) - return diff --git a/lmdeploy/pytorch/modeling/modeling_baichuan.py b/lmdeploy/pytorch/modeling/modeling_baichuan.py deleted file mode 100644 index a790e81d06..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_baichuan.py +++ /dev/null @@ -1,824 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
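The deleted `convert_to_qmodules.py` shown above recursively swapped `nn.Linear` and RMSNorm modules for their quantized counterparts; `smooth_quant.py` performs that substitution directly on the loaded Hugging Face model, so the bundled `modeling_*` files that relied on the helper are dropped next. A condensed sketch of the swap pattern, adapted from the deleted helper (here `norm_type` stands for whatever `NORM_TYPE_MAP` resolves for the model):

```python
# Condensed from the deleted convert_to_qmodules helper: replace Linear and
# RMSNorm children of a decoder layer with QLinear / QRMSNorm, in place.
import torch.nn as nn

from lmdeploy.pytorch.models import QLinear, QRMSNorm


def swap_qmodules(module: nn.Module, norm_type: str):
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            setattr(module, name,
                    QLinear.from_float(child, initialization=False))
        elif type(child).__name__ == norm_type:
            setattr(module, name,
                    QRMSNorm.from_float(child, initialization=False))
        else:
            swap_qmodules(child, norm_type)
```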
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -from .configuration_baichuan import BaiChuanConfig - -logger = get_logger('lmdeploy') - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class RMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """RMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class RotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for Baichuan Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. 
Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - index = (torch.arange(0, dim, 2).float().to(device) / dim) - inv_freq = 1.0 / (base**index) - self.register_buffer('inv_freq', inv_freq) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. - - Generates positional embeddings for the given input tensor. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in - # `__init__`. Keep the logic here just in case. - if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=x.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in - # order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, - # so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MLP(nn.Module): - """MLP for Baichuan Model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: BaiChuanConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.W_pack = nn.Linear(self.hidden_size, - 3 * self.hidden_size, - bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=False) - self.rotary_emb = RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, - (3, self.hidden_size)).unsqueeze(0).transpose( - 0, -2).squeeze(-2) - query_states = proj[0].view( - bsz, q_len, self.num_heads, self.head_dim).transpose( - 1, 2) # batch_size x source_len x hidden_size - key_states = proj[1].view(bsz, q_len, - self.num_heads, self.head_dim).transpose( - 1, - 2) # batch_size x target_len x head_size - value_states = proj[2].view( - bsz, q_len, self.num_heads, self.head_dim).transpose( - 1, 2) # batch_size x source_len x hidden_size - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, 
key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)},' - f' but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - '`attn_output` should be of size ' - f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class DecoderLayer(nn.Module): - """Decoder layer for Baichuan Model.""" - - def __init__(self, config: BaiChuanConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = Attention(config=config) - self.mlp = MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ # noqa: E501 - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -class PreTrainedModel(PreTrainedModel): - config_class = BaiChuanConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['DecoderLayer'] - _keys_to_ignore_on_load_unexpected = [r'decoder\.version'] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, Model): - module.gradient_checkpointing = value - - -class Model(PreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`DecoderLayer`] - - Args: - config: BaiChuanConfig - """ - - def __init__(self, config: BaiChuanConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList( - [DecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder. 
- # prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = (use_cache - if use_cache is not None else self.config.use_cache) - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids ' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids ' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient ' - 'checkpointing. 
Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class BaiChuanForCausalLM(PreTrainedModel): - """This class extends the `PreTrainedModel` to enable causal language - modeling. - - It wraps the basic Baichuan model (`Model`) and includes a linear layer as - a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - - def __init__(self, config): - super().__init__(config) - self.model = Model(config) - - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, ModelForCausalLM - - >>> model = ModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
- ```""" - - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of - # (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, - # we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. - - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. 
- """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past diff --git a/lmdeploy/pytorch/modeling/modeling_internlm.py b/lmdeploy/pytorch/modeling/modeling_internlm.py deleted file mode 100644 index c640641132..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_internlm.py +++ /dev/null @@ -1,1171 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch InternLM model.""" -import math -import queue -import threading -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.generation.streamers import BaseStreamer -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -from .configuration_internlm import InternLMConfig - -logger = get_logger('lmdeploy') - -_CONFIG_FOR_DOC = 'InternLMConfig' - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) 
- - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class InternLMRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """InternLMRMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class InternLMRotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for InternLM Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - index = (torch.arange(0, dim, 2).float().to(device) / dim) - inv_freq = 1.0 / (base**index) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. - - Generates positional embeddings for the given input tensor. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in - # `__init__`. Keep the logic here just in case. 
- if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=x.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in - # order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, so we can - # `squeeze` them. - cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLMMLP(nn.Module): - """MLP for InternLM Model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class InternLMAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.q_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.k_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.v_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=config.bias) - self.rotary_emb = InternLMRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: 
Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)}, ' - f'but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - 'attn_output` should be of size ' - f'`{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLMDecoderLayer(nn.Module): - """Decoder layer for InternLM Model.""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = InternLMAttention(config=config) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = InternLMRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = InternLMRMSNorm( - config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - 
torch.FloatTensor]]]: - """ # noqa: E501 - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -INTERNLM_START_DOCSTRING = r""" # noqa: E501 - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLMConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - 'The bare InternLM Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - INTERNLM_START_DOCSTRING, -) -class InternLMPreTrainedModel(PreTrainedModel): - config_class = InternLMConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLMDecoderLayer'] - _keys_to_ignore_on_load_unexpected = [r'decoder\.version'] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, InternLMModel): - module.gradient_checkpointing = value - - -INTERNLM_INPUTS_DOCSTRING = r""" # noqa: E501 - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - - -@add_start_docstrings( - 'The bare InternLM Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - INTERNLM_START_DOCSTRING, -) -class InternLMModel(InternLMPreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`InternLMDecoderLayer`] - - Args: - config: InternLMConfig - """ - _auto_class = 'AutoModel' - - def __init__(self, config: InternLMConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) - self.norm = InternLMRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder. - # prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions or self.config.output_attentions - output_hidden_states = (output_hidden_states - or self.config.output_hidden_states) - use_cache = use_cache or self.config.use_cache - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids ' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids ' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - 
past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient ' - 'checkpointing. Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class InternLMForCausalLM(InternLMPreTrainedModel): - """This class extends the `InternLMPreTrainedModel` to enable causal - language modeling. - - It wraps the basic InternLM model (`InternLMModel`) and includes a linear - layer as a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
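For reference, the mask handling in `InternLMModel.forward` above reduces to an additive float mask that combines causal ordering with the key-side padding mask, while `position_ids` for incremental decoding simply continue from the cached length. A minimal self-contained sketch of those two pieces, with toy shapes and a hand-rolled mask builder standing in for the `_make_causal_mask`/`_expand_mask` helpers used in this file:

```python
import torch

def build_decoder_mask(attention_mask, tgt_len, past_len, dtype=torch.float32):
    """Additive mask of shape (bsz, 1, tgt_len, past_len + tgt_len):
    0 where attention is allowed, dtype-min where it is not."""
    bsz, src_len = attention_mask.shape
    min_val = torch.finfo(dtype).min

    # Causal part: query i (offset by the cache length) sees keys <= past_len + i.
    keys = torch.arange(src_len)
    queries = torch.arange(tgt_len).unsqueeze(-1) + past_len
    causal = torch.full((tgt_len, src_len), min_val, dtype=dtype)
    causal = causal.masked_fill(keys <= queries, 0.0)

    # Padding part: broadcast the (bsz, src_len) key-side mask to 4D.
    padding = (1.0 - attention_mask[:, None, None, :].to(dtype)) * min_val
    return causal[None, None] + padding


past_len, tgt_len = 3, 2
# Position ids continue after the cached tokens, one per new token.
position_ids = torch.arange(past_len, past_len + tgt_len).unsqueeze(0)
mask = build_decoder_mask(torch.ones(1, past_len + tgt_len), tgt_len, past_len)
print(position_ids, mask.shape)  # tensor([[3, 4]]) torch.Size([1, 1, 2, 5])
```

The sketch only mirrors the resulting shapes and semantics; the deleted code delegates the actual construction to the imported helpers.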
- """ - _auto_class = 'AutoModelForCausalLM' - - def __init__(self, config): - super().__init__(config) - self.model = InternLMModel(config) - - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLMForCausalLM - - >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
- ```""" - - output_attentions = output_attentions or self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of - # (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, - # we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. - - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. 
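The loss in `InternLMForCausalLM.forward` above is the standard next-token objective: logits at position `t` are scored against the label at position `t + 1`, and positions labelled `-100` are ignored. A toy sketch of just that shift (all sizes below are arbitrary):

```python
import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, vocab_size = 2, 5, 11            # toy sizes
logits = torch.randn(batch, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq_len))
labels[:, -1] = -100                             # mark one position as ignored

# Drop the last logit and the first label so position t predicts token t + 1,
# then flatten for the token-level cross entropy.
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
loss = CrossEntropyLoss()(shift_logits, shift_labels)
print(loss.item())
```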
- """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - def build_inputs(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = []): - """Builds the input for the model.""" - prompt = '' - for record in history: - prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}\n""" # noqa: E501 - prompt += f"""<|User|>:{query}\n<|Bot|>:""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): - """Provides a chatting functionality for the model.""" - inputs = self.build_inputs(tokenizer, query, history) - inputs = { - k: v.to(self.device) - for k, v in inputs.items() if torch.is_tensor(v) - } - outputs = self.generate(**inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): - """Return a generator in format: (response, history) Eg. - - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) ('你好,有什么可以帮助您的吗?', [('你好', - '你好,有什么可以帮助您的吗?')]) - """ - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.received_inputs = False - self.queue.put( - (self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - token = self.tokenizer.decode([value[-1]], - skip_special_tokens=True) - if token.strip() != '': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat(tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -@add_start_docstrings( - """ # noqa: E501 - The InternLM Model transformer with a sequence classification head on top (linear layer). - - [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. 
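For context on the two entry points above: `chat` returns one complete reply, while `stream_chat` returns a generator that yields `(partial_response, updated_history)` pairs produced by a background generation thread. A hypothetical usage sketch; the checkpoint path is a placeholder, and loading assumes the usual `trust_remote_code` path for these remote classes:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

path = 'PATH_TO_INTERNLM_CHAT_CHECKPOINT'  # placeholder
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True).eval()

# Blocking call: returns the full reply plus the extended history.
response, history = model.chat(tokenizer, 'Hello', history=[])

# Streaming call: iterate partial replies as tokens arrive.
for partial, _ in model.stream_chat(tokenizer, 'Tell me more', history=history):
    print(partial, end='\r')
```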
- - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - INTERNLM_START_DOCSTRING, -) -class InternLMForSequenceClassification(InternLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [r'lm_head.weight'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLMModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" # noqa: E501 - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = ( - torch.ne(input_ids, self.config.pad_token_id).sum(-1) - - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/lmdeploy/pytorch/modeling/modeling_internlm2.py b/lmdeploy/pytorch/modeling/modeling_internlm2.py deleted file mode 100644 index cb61df0256..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_internlm2.py +++ /dev/null @@ -1,1940 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/modeling_llama.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
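One note on the pooling rule used by `InternLMForSequenceClassification` above: the head scores every position, but only the logits of the last non-padding token of each row are kept (or simply the last position when no `pad_token_id` is configured), which assumes right padding. A toy illustration with a made-up `pad_token_id` of 0:

```python
import torch

pad_token_id = 0                                   # assumed value for the demo
input_ids = torch.tensor([[5, 8, 3, 0, 0],
                          [7, 2, 9, 4, 6]])
logits = torch.randn(2, 5, 3)                      # (batch, seq_len, num_labels)

# Index of the last non-padding token per row; assumes right padding.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1   # tensor([2, 4])
pooled_logits = logits[torch.arange(2), sequence_lengths]          # (batch, num_labels)
print(sequence_lengths.tolist(), pooled_logits.shape)
```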
-"""PyTorch InternLM2 model.""" -import math -import queue -import threading -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from einops import rearrange -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - QuestionAnsweringModelOutput, - SequenceClassifierOutputWithPast, - TokenClassifierOutput) -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, logging, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules - -try: - from transformers.generation.streamers import BaseStreamer -except Exception: - BaseStreamer = None - -from .configuration_internlm2 import InternLM2Config - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import (index_first_axis, pad_input, - unpad_input) - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = 'InternLM2Config' - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, - dtype=torch.int32), (1, 0)) # pylint: disable=E1102 - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -class InternLM2RMSNorm(nn.Module): - """InternLM2RMSNorm is equivalent to T5LayerNorm.""" - - def __init__(self, hidden_size, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(InternLM2RMSNorm) - - -class InternLM2RotaryEmbedding(nn.Module): - """Rotary Position Embedding for the InternLM2 model. - - Credits to the Reddit user /u/lucidrains. 
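The rotary embedding classes in this file all derive from the same recipe: per-channel inverse frequencies, an outer product with the position indices, and `cos`/`sin` tables returned in the input dtype. A small numeric sketch of that computation with toy `dim` and sequence length, including the linear-scaling variant, which only divides the position indices (the dynamic NTK variant instead rescales `base` for long inputs):

```python
import torch

dim, base, scaling_factor = 8, 10000.0, 2.0        # toy hyper-parameters
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))   # (dim // 2,)

position_ids = torch.arange(6).float()[None, :]    # (1, seq_len)

# Plain RoPE: outer product of positions and inverse frequencies,
# duplicated so cos/sin cover the full head dimension.
freqs = position_ids[..., None] * inv_freq         # (1, seq_len, dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)            # (1, seq_len, dim)
cos, sin = emb.cos(), emb.sin()

# Linear scaling: same lookup on stretched position indices.
scaled_freqs = (position_ids / scaling_factor)[..., None] * inv_freq
print(cos.shape, torch.allclose(scaled_freqs * scaling_factor, freqs))
```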
- """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**(torch.arange( - 0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings - - @torch.no_grad() - def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand( - position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance( - device_type, str) and device_type != 'mps' else 'cpu' - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() - @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with linear scaling. - - Credits to the Reddit user /u/kaiokendev - """ - - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is applied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ((self.scaling_factor * seq_len / - self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.dim / - (self.dim - 2)) - inv_freq = 1.0 / (base**(torch.arange( - 0, self.dim, 2, dtype=torch.int64).float().to(x.device) / - self.dim)) - self.register_buffer( - 'inv_freq', inv_freq, - persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): # pylint: disable=unused-argument - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. 
- unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLM2MLP(nn.Module): - """MLP for InternLM2 model.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.w1 = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.w3 = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.w2 = nn.Linear(self.intermediate_size, - self.hidden_size, - bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """This is the equivalent of torch.repeat_interleave(x, dim=1, - repeats=n_rep). - - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, - num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - -class InternLM2Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, - config: InternLM2Config, - layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f'Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will ' - 'lead to errors during the forward call if caching is used. 
Please make sure to provide a `layer_idx` ' - 'when creating this class.') - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - - self.wqkv = nn.Linear( - self.hidden_size, - (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, - bias=config.bias, - ) - self.wo = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=config.bias) - - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = InternLM2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'linear': - self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == 'dynamic': - self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f'Unknown RoPE scaling type {scaling_type}') - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, # pylint: disable=unused-argument - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - # split qkv_states by tp size - key_value_slicing = (self.num_key_value_heads * - self.head_dim) // self.config.pretraining_tp - qkv_slices = self.wqkv.weight.split(key_value_slicing, dim=0) - qkv_states = torch.cat( - [ - F.linear(hidden_states, qkv_slice) - for qkv_slice in qkv_slices - ], - dim=-1 # pylint: disable=E1102 - ) - else: - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, - 'b q h gs d -> b q (h gs) d').transpose(1, 2) - key_states = qkv_states[..., -2, :].transpose(1, 2) - value_states = qkv_states[..., -1, :].transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, 
cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // - self.config.pretraining_tp, - dim=2) - o_proj_slices = self.wo.weight.split(self.hidden_size // - self.config.pretraining_tp, - dim=1) - attn_output = sum([ - F.linear(attn_output[i], o_proj_slices[i]) # pylint: disable=E1102 - for i in range(self.config.pretraining_tp) - ]) - else: - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLM2FlashAttention2(InternLM2Attention): - """InternLM2 flash attention module. - - This module inherits from `InternLM2Attention` as the weights of the module - stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal - with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, - # that was made default for flash_attn>=2.1. This attribute is used to handle this difference. - # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) - # produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10( - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - if isinstance(past_key_value, StaticCache): - raise ValueError( - '`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` ' - 'make sure to use `sdpa` in the mean time, and open an issue at ' - 'https://github.com/huggingface/transformers') - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs) - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout - # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # dropout_rate = self.attention_dropout if self.training else 0.0 - dropout_rate = 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (InternLM2RMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, '_pre_quantization_dtype'): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.wqkv.weight.dtype - - logger.warning_once( - f'The input hidden states seems to be silently casted in float32, this might be related to' - f' the fact you have upcasted embedding or layer norm layers in float32. 
We will cast back the input in' - f' {target_dtype}.') - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward(query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate) - - attn_output = attn_output.reshape(bsz, q_len, - self.hidden_size).contiguous() - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value # pylint: disable=E0606 - - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. - # For details, please see the comment in InternLM2FlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, - query_length) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( # pylint: disable=E0606 - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, - query_length) # pylint: disable=E0606 - else: - attn_output = flash_attn_func( # pylint: disable=E0606 - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, - query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( # pylint: disable=E0606 - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - value_layer = index_first_axis( # pylint: disable=E0606 - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - if query_length == kv_seq_len: - query_layer = index_first_axis( # pylint: disable=E0606 - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( # pylint: disable=E0606 - query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LllamaSdpaAttention with Llama->InternLM2 -class InternLM2SdpaAttention(InternLM2Attention): - """InternLM2 attention module using - torch.nn.functional.scaled_dot_product_attention. - - This module inherits from `InternLM2Attention` as the weights of the module - stays untouched. The only changes are on the forward pass to adapt to SDPA - API. - """ - - # Adapted from InternLM2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. 
`model.config.attn_implementation = "manual"` - # once this is implemented. - logger.warning_once( - 'InternLM2Model uses InternLM2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` ' - 'does not support `output_attentions=True`. Falling back to the manual attention implementation, ' - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. ' - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, :key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with - # custom attn_mask, Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == 'cuda' and causal_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of - # an inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph - # options. An inline conditional prevents dynamic shapes from compiling. - is_causal = bool(causal_mask is None and q_len > 1) - - attn_output = torch.nn.functional.scaled_dot_product_attention( # pylint: disable=E1102 - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.wo(attn_output) - - return attn_output, None, past_key_value - - -INTERNLM2_ATTENTION_CLASSES = { - 'eager': InternLM2Attention, - 'flash_attention_2': InternLM2FlashAttention2, - 'sdpa': InternLM2SdpaAttention, -} - - -# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM2 -class InternLM2DecoderLayer(nn.Module): - """InternLM2 Decoder Layer. 
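All three attention variants above share the same fused `wqkv` layout: for each key/value head, the projection emits its group of query heads plus one key head and one value head, and the `rearrange` with `gs=2 + num_key_value_groups` unpacks that. A toy split with made-up head counts:

```python
import torch
from einops import rearrange

bsz, q_len, head_dim = 1, 4, 16
num_heads, num_kv_heads = 8, 2                      # 4 query heads per kv head
num_kv_groups = num_heads // num_kv_heads

qkv = torch.randn(bsz, q_len, (num_heads + 2 * num_kv_heads) * head_dim)
qkv = rearrange(qkv, 'b q (h gs d) -> b q h gs d',
                gs=2 + num_kv_groups, d=head_dim)   # h ends up as num_kv_heads

query = rearrange(qkv[..., :num_kv_groups, :], 'b q h gs d -> b q (h gs) d')
key, value = qkv[..., -2, :], qkv[..., -1, :]
print(query.shape, key.shape, value.shape)
# torch.Size([1, 4, 8, 16]) torch.Size([1, 4, 2, 16]) torch.Size([1, 4, 2, 16])
```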
- - This module is a single layer of the InternLM2 model. - """ - - def __init__(self, config: InternLM2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - self.layer_idx = layer_idx - - self.attention = INTERNLM2_ATTENTION_CLASSES[ - config.attn_implementation](config=config, layer_idx=layer_idx) - - self.feed_forward = InternLM2MLP(config) - self.attention_norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.ffn_norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.attention_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.attention( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.ffn_norm(hidden_states) - hidden_states = self.feed_forward(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -InternLM2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLM2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
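The decoder layer above follows the usual pre-norm residual pattern: normalize, run the sub-module, add back the input, twice per layer. A miniature stand-in, where plain `LayerNorm` and `Linear` replace RMSNorm, attention and the gated MLP purely for brevity:

```python
import torch
from torch import nn

class TinyPreNormBlock(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.attention_norm = nn.LayerNorm(dim)
        self.ffn_norm = nn.LayerNorm(dim)
        self.attention = nn.Linear(dim, dim)      # stand-in for self-attention
        self.feed_forward = nn.Linear(dim, dim)   # stand-in for the gated MLP

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attention(self.attention_norm(x))   # norm -> mix -> residual
        x = x + self.feed_forward(self.ffn_norm(x))      # norm -> mlp -> residual
        return x

print(TinyPreNormBlock(8)(torch.randn(2, 3, 8)).shape)  # torch.Size([2, 3, 8])
```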
-""" - - -# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2PreTrainedModel(PreTrainedModel): - """InternLM2 pretraiend model's base class.""" - - config_class = InternLM2Config - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLM2DecoderLayer'] - _skip_keys_device_placement = ['past_key_values'] - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - _supports_quantized_cache = True - _supports_static_cache = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -InternLM2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. 
If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. -""" - - -# Modified from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2Model(InternLM2PreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. 
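On the two cache formats described above: a legacy `past_key_values` is a tuple of per-layer `(key, value)` pairs, and the model promotes it to a `Cache` object internally, converting back with `to_legacy_cache` when returning. A small round-trip sketch with toy sizes, mirroring the conversion done inside `InternLM2Model.forward`:

```python
import torch
from transformers.cache_utils import DynamicCache

num_layers, num_kv_heads, seq_len, head_dim = 2, 2, 5, 16   # toy sizes
legacy = tuple(
    (torch.randn(1, num_kv_heads, seq_len, head_dim),
     torch.randn(1, num_kv_heads, seq_len, head_dim))
    for _ in range(num_layers)
)

cache = DynamicCache.from_legacy_cache(legacy)   # legacy tuples accepted for BC
print(cache.get_seq_length())                    # 5

roundtrip = cache.to_legacy_cache()              # back to the tuple format
print(len(roundtrip), roundtrip[0][0].shape)     # 2 torch.Size([1, 2, 5, 16])
```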
- Each layer is a [`InternLM2DecoderLayer`] - - Args: - config: InternLM2Config - """ - - _auto_class = 'AutoModel' - - def __init__(self, config: InternLM2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - - self.tok_embeddings = nn.Embedding(config.vocab_size, - config.hidden_size, - self.padding_idx) - - self.layers = nn.ModuleList([ - InternLM2DecoderLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.tok_embeddings - - def set_input_embeddings(self, value): - self.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one' - ) - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.' 
- ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.tok_embeddings(input_ids) - - return_legacy_cache = False - if use_cache and not isinstance( - past_key_values, - Cache): # kept for BC (non `Cache` `past_key_values` inputs) - return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length( - ) if past_key_values is not None else 0 - cache_position = torch.arange(past_seen_tokens, - past_seen_tokens + - inputs_embeds.shape[1], - device=inputs_embeds.device) - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, - cache_position, past_key_values, - output_attentions) - - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[ - 2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if return_legacy_cache: - next_cache = next_cache.to_legacy_cache() - - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - def _update_causal_mask( - self, - attention_mask: torch.Tensor, - input_tensor: torch.Tensor, - cache_position: torch.Tensor, - past_key_values: Cache, - output_attentions: bool, - ): - # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length - # even when the static KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at - # each decode steps due to the dynamic shapes. (`recording cudagraph tree for symint key 13`, etc.), which is - # VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using `fullgraph=True`. - # See more context in https://github.com/huggingface/transformers/pull/29114 - - if self.config.attn_implementation == 'flash_attention_2': - if attention_mask is not None and 0.0 in attention_mask: - return attention_mask - return None - - # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in - # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail - # to infer the attention mask. 
- past_seen_tokens = past_key_values.get_seq_length( - ) if past_key_values is not None else 0 - using_static_cache = isinstance(past_key_values, StaticCache) - - # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config.attn_implementation == 'sdpa' and not using_static_cache and not output_attentions: - if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, - inputs_embeds=input_tensor, - past_key_values_length=past_seen_tokens, - is_training=self.training, - ): - return None - - dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min - sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() - else: - target_length = (attention_mask.shape[-1] if isinstance( - attention_mask, torch.Tensor) else past_seen_tokens + - sequence_length + 1) - - if attention_mask is not None and attention_mask.dim() == 4: - # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing - if attention_mask.max() != 0: - raise ValueError( - 'Custom 4D attention mask should be passed in inverted form with max==0`' - ) - causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange( - target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand( - input_tensor.shape[0], 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone( - ) # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, : - mask_length] + attention_mask[:, - None, - None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, : - mask_length] = causal_mask[:, :, :, : - mask_length].masked_fill( - padding_mask, - min_dtype) - if (self.config.attn_implementation == 'sdpa' - and attention_mask is not None - and attention_mask.device.type == 'cuda' - and not output_attentions): - # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when - # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
- # Details: https://github.com/pytorch/pytorch/issues/110213 - causal_mask = AttentionMaskConverter._unmask_unattended( - causal_mask, min_dtype) # pylint: disable=E1120 - - return causal_mask - - -# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM -class InternLM2ForCausalLM(InternLM2PreTrainedModel): - """Causal language model (CLM) for InternLM2.""" - - _auto_class = 'AutoModelForCausalLM' - _tied_weights_keys = ['output.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = InternLM2Model(config) - self.vocab_size = config.vocab_size - self.output = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - def get_output_embeddings(self): - return self.output - - def set_output_embeddings(self, new_embeddings): - self.output = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLM2ForCausalLM - - >>> model = InternLM2ForCausalLM.from_pretrained("meta-InternLM2/InternLM2-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-InternLM2/InternLM2-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - output_slices = self.output.weight.split( - self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [ - F.linear(hidden_states, output_slices[i]) # pylint: disable=not-callable - for i in range(self.config.pretraining_tp) - ] - logits = torch.cat(logits, dim=-1) - else: - logits = self.output(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - use_cache=True, - **kwargs, - ): - past_length = 0 - if past_key_values is not None: - if isinstance(past_key_values, Cache): - past_length = cache_position[ - 0] if cache_position is not None else past_key_values.get_seq_length( - ) - max_cache_length = (torch.tensor( - past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() - is not None else None) - cache_length = past_length if max_cache_length is None else torch.min( - max_cache_length, past_length) - # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input) - if attention_mask is not None and attention_mask.shape[ - 1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - - past_length):] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if (max_cache_length is not None and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length): - attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise - # recompiles graphs as the stride of the inputs is a guard. - # Ref: https://github.com/huggingface/transformers/pull/29114 - # TODO: use `next_tokens` directly instead. - model_inputs = {'input_ids': input_ids.contiguous()} - - input_length = position_ids.shape[ - -1] if position_ids is not None else input_ids.shape[-1] - if cache_position is None: - cache_position = torch.arange(past_length, - past_length + input_length, - device=input_ids.device) - elif use_cache: - cache_position = cache_position[-input_length:] - - model_inputs.update({ - 'position_ids': position_ids, - 'cache_position': cache_position, - 'past_key_values': past_key_values, - 'use_cache': use_cache, - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past), ) - return reordered_past - - def build_inputs(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - meta_instruction=''): - if history is None: - history = [] - if tokenizer.add_bos_token: - prompt = '' - else: - prompt = tokenizer.bos_token - if meta_instruction: - prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" - for record in history: - prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" - prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat( - self, - tokenizer, - query: str, - history: Optional[List[Tuple[str, str]]] = None, - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - meta_instruction: - str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n' - '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory ' - '(上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n' - '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such ' - 'as English and 中文.', - **kwargs, - ): - if history is None: - history = [] - inputs = self.build_inputs(tokenizer, query, history, meta_instruction) - inputs = { - k: v.to(self.device) - for k, v in inputs.items() if torch.is_tensor(v) - } - # also add end-of-assistant token in eos token id to avoid unnecessary generation - eos_token_id = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0] - ] - outputs = self.generate( - **inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - eos_token_id=eos_token_id, - **kwargs, - ) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('<|im_end|>')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs, - ): - if history is None: - history = [] - """ - Return a generator in format: (response, history) - Eg. - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) - ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) - """ - if BaseStreamer is None: - raise ModuleNotFoundError( - 'The version of `transformers` is too low. Please make sure ' - 'that you have installed `transformers>=4.28.0`.') - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - """Streamer used in generate to print words one by one.""" - - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.cache = [] - self.received_inputs = False - self.queue.put( - (self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - self.cache.extend(value.tolist()) - token = self.tokenizer.decode(self.cache, - skip_special_tokens=True) - if token.strip() != '<|im_end|>': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - self.cache = [] - else: - self.end() - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat( - tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs, - ) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a sequence classification head on top (linear layer). 
- - [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): - """Sequence Classification Head for InternLM2 Model.""" - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq( - input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype - in (torch.long, torch.int)): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaForQuestionAnswering with Llama->InternLM2 -@add_start_docstrings( - """ -The InternLM2 Model transformer with a span classification head on top for extractive question-answering tasks like -SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
-    """,
-    InternLM2_START_DOCSTRING,
-)
-class InternLM2ForQuestionAnswering(InternLM2PreTrainedModel):
-    """Question Answering model for InternLM2."""
-
-    base_model_prefix = 'transformer'
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = InternLM2Model(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, 2)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.transformer.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.transformer.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache,
-                                        List[torch.FloatTensor]]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        start_positions: Optional[torch.LongTensor] = None,
-        end_positions: Optional[torch.LongTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
-        r"""
-        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
-            are not taken into account for computing the loss.
-        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
-            are not taken into account for computing the loss.
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to( - start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss, ) + - output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states - output) e.g. for Named-Entity-Recognition (NER) tasks. 
- """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForTokenClassification(InternLM2PreTrainedModel): - """Token classification model for InternLM2.""" - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - if getattr(config, 'classifier_dropout', None) is not None: - classifier_dropout = config.classifier_dropout - elif getattr(config, 'hidden_dropout', None) is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.score = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.score(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/lmdeploy/pytorch/modeling/modeling_llama.py b/lmdeploy/pytorch/modeling/modeling_llama.py deleted file mode 100644 index c37d1e9aee..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_llama.py +++ /dev/null @@ -1,1297 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - -_CONFIG_FOR_DOC = 'LlamaConfig' - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.finfo(dtype).min, - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """LlamaRMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for Llama Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. 
It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - """Sets the cached sine and cosine values for the specified sequence - length. - - Args: - seq_len (int): The sequence length for which to set the cache. - device (str): The device to use for computation. - dtype (torch.dtype): The data type to be used for tensors. - """ - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. Generates - positional embeddings for the given input tensor. - - If the sequence length is larger than the cache, it resets the cache. - - Args: - x (torch.Tensor): Input tensor of shape - [batch_size, num_attention_heads, seq_len, head_size]. - seq_len (int, optional): Sequence length. If None, it is obtained - from `x`. - - Returns: - tuple: Tuple containing cosine and sine positional embeddings. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=x.device, - dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """This class extends the `LlamaRotaryEmbedding` with linear scaling. - - It provides a mechanism for adjusting the scale of the positional - embeddings by dividing the tensor generated by the range of sequence length - with a scaling factor. This is useful when dealing with sequences of - varying lengths. - - Credits to Reddit User /u/kaiokendev for this extension. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. If None, - defaults to the device of the model. 
- scaling_factor (float, optional): Scaling factor used in adjusting - the scale of positional embeddings. Default is 1.0. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - """Sets the cached sine and cosine values for the specified sequence - length. - - Args: - seq_len (int): The sequence length for which to set the cache. - device (str): The device to use for computation. - dtype (torch.dtype): The data type to use for tensors. - """ - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ((self.scaling_factor * seq_len / - self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.dim / - (self.dim - 2)) - inv_freq = 1.0 / (base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, - # so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - """MLP for Llama Model.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.up_proj = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.down_proj = nn.Linear(self.intermediate_size, - self.hidden_size, - bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([ - F.linear(x, gate_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ], - dim=-1) - up_proj = torch.cat([ - F.linear(x, up_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ], - dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split( - slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj( - self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """This is the equivalent of torch.repeat_interleave(x, dim=1, - repeats=n_rep). 
- - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, - num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.q_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=False) - self.k_proj = nn.Linear(self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False) - self.v_proj = nn.Linear(self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=False) - self._init_rope() - - def _init_rope(self): - """Initialize the Rotary Embedding Module.""" - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'linear': - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == 'dynamic': - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f'Unknown RoPE scaling type {scaling_type}') - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * - self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, - dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [ - 
F.linear(hidden_states, query_slices[i]) - for i in range(self.config.pretraining_tp) - ] - query_states = torch.cat(query_states, dim=-1) - - key_states = [ - F.linear(hidden_states, key_slices[i]) - for i in range(self.config.pretraining_tp) - ] - key_states = torch.cat(key_states, dim=-1) - - value_states = [ - F.linear(hidden_states, value_slices[i]) - for i in range(self.config.pretraining_tp) - ] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)}, ' - f'but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - '`attn_output` should be of size ' - f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // - self.config.pretraining_tp, - dim=2) - o_proj_slices = self.o_proj.weight.split( - self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([ - F.linear(attn_output[i], o_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - """Decoder layer for Llama Model.""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - 
self.input_layernorm = LlamaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape - `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask - of size `(batch, 1, tgt_len, src_len)` where padding elements - are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are - returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached - past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -LLAMA_START_DOCSTRING = r""" # noqa: E501 - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['LlamaDecoderLayer'] - _skip_keys_device_placement = 'past_key_values' - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" # noqa: E501 - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList([ - LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers) - ]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask # noqa - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = (output_attentions if 
output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = (use_cache - if use_cache is not None else self.config.use_cache) - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient' - ' checkpointing. 
Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - """This class extends the `LlamaPreTrainedModel` to enable causal language - modeling. - - It wraps the basic Llama model (`LlamaModel`) and includes a linear layer - as a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - _tied_weights_keys = ['lm_head.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) # noqa: E501 - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split( - self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [ - F.linear(hidden_states, lm_head_slices[i]) - for i in range(self.config.pretraining_tp) - ] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them - # in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. 
- - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. - """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """ # noqa: E501 - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" # noqa: E501 - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq( - input_ids, self.config.pad_token_id).long().argmax(-1) - - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From 9bfdeaece9b5c6138a46f662f090e3e7ee926ea5 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Wed, 4 Dec 2024 19:48:31 +0800 Subject: [PATCH 40/40] convert kv cache to nd format in ascend graph mode (#2853) --- .../backends/dlinfer/ascend/graph_runner.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py index b69cb1dca5..f9664f13ff 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -1,15 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings from importlib import import_module +from typing import List import torch import torch.distributed +import torch_npu from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig +from lmdeploy.pytorch.model_inputs import StepContext from lmdeploy.utils import get_logger from ...graph_runner import GraphRunner +ACL_FORMAT_ND = 2 + logger = get_logger('lmdeploy') @@ -110,3 +115,31 @@ def allocate_gpu_cache_mark_static(self): return gpu_cache setattr(cache_engine_class, func_str, allocate_gpu_cache_mark_static) + + def _convert_kv_format(self, + past_key_values: List[List[torch.Tensor]]) -> None: + """Convert key/value caches to ACL_FORMAT_ND format if needed.""" + # Check format of first KV cache + if torch_npu.get_npu_format(past_key_values[0][0]) == ACL_FORMAT_ND: + return + + # Convert all KV caches to ACL_FORMAT_ND + for layer_kv in past_key_values: + key_cache, value_cache = layer_kv + torch_npu.npu_format_cast(key_cache, ACL_FORMAT_ND) + torch_npu.npu_format_cast(value_cache, ACL_FORMAT_ND) + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: torch.Tensor = None, + context: StepContext = None, + ): + """prepare inputs.""" + if self.enable_graph: + self._convert_kv_format(past_key_values) + return self.model.prepare_inputs_for_generation( + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + context=context, + )
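
For readers unfamiliar with Ascend ACL storage formats, a minimal standalone sketch of the check-then-cast pattern introduced in the patch above follows. It is not part of the patch: `ensure_nd_format` is a hypothetical helper name, the cache shapes are illustrative only, and running it assumes an Ascend NPU with `torch_npu` installed. Unlike the patch, this sketch keeps the tensors returned by `torch_npu.npu_format_cast` in a new list.

```python
# Sketch only: cast paged KV caches to the ND storage format on Ascend.
# Assumes torch_npu is installed and an NPU device is visible.
from typing import List

import torch
import torch_npu

ACL_FORMAT_ND = 2  # same constant the patch defines


def ensure_nd_format(
        kv_caches: List[List[torch.Tensor]]) -> List[List[torch.Tensor]]:
    """Return KV caches whose NPU storage format is ND.

    Only the first tensor is inspected, assuming every layer's cache was
    allocated with the same format.
    """
    if torch_npu.get_npu_format(kv_caches[0][0]) == ACL_FORMAT_ND:
        return kv_caches
    return [[torch_npu.npu_format_cast(t, ACL_FORMAT_ND) for t in layer_kv]
            for layer_kv in kv_caches]


if __name__ == '__main__':
    # One layer of a paged cache: (num_blocks, block_size, num_heads, head_dim)
    key = torch.empty(16, 128, 8, 64, dtype=torch.float16, device='npu')
    value = torch.empty_like(key)
    caches = ensure_nd_format([[key, value]])
    print(torch_npu.get_npu_format(caches[0][0]))  # expected: 2 (ND)
```

Checking only the first cache tensor keeps the guard cheap: when the caches are already ND, the whole call is a single format query, which matters since `prepare_inputs_for_generation` runs on every decoding step in graph mode.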