support qwen2-vl with turbomind backend
irexyc committed Nov 5, 2024
1 parent 71f1d0f commit 90a7a24
Showing 21 changed files with 319 additions and 23 deletions.
2 changes: 1 addition & 1 deletion docs/en/multi_modal/qwen2_vl.md
@@ -5,7 +5,7 @@ LMDeploy supports the following Qwen-VL series of models, which are detailed in
| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, Pytorch |
| Qwen2-VL | 2B, 7B | PyTorch |
| Qwen2-VL | 2B-72B | TurboMind, PyTorch |

The next chapter demonstrates how to deploy a Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example.

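Since the table now lists TurboMind for Qwen2-VL, a minimal pipeline sketch follows. It assumes the standard lmdeploy pipeline API (`pipeline`, `TurbomindEngineConfig`, `lmdeploy.vl.load_image`); the model id, session length, and image URL are illustrative examples, not part of this commit.

```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Build a VLM pipeline on the TurboMind backend; session_len is illustrative.
pipe = pipeline('Qwen/Qwen2-VL-7B-Instruct',
                backend_config=TurbomindEngineConfig(session_len=8192))

# Any local path or URL works here; this URL is only a placeholder.
image = load_image('https://example.com/demo.jpg')
response = pipe(('describe this image', image))
print(response)
```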
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -19,6 +19,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2-VL | 2B - 72B | MLLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
2 changes: 1 addition & 1 deletion docs/zh_cn/multi_modal/qwen2_vl.md
@@ -5,7 +5,7 @@ LMDeploy supports the Qwen-VL series of models, as follows:
| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, Pytorch |
| Qwen2-VL | 2B, 7B | PyTorch |
| Qwen2-VL | 2B-72B | TurboMind,PyTorch |

This article takes [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example to demonstrate how to deploy the Qwen2-VL series of models with LMDeploy.

1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -19,6 +19,7 @@
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2-VL | 2B - 72B | MLLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -2,6 +2,7 @@
import inspect
import json
from dataclasses import asdict, fields
from typing import List

# use pydantic.dataclasses.dataclass to check data type
from pydantic.dataclasses import dataclass
@@ -73,6 +74,7 @@ class AttentionConfig:
    high_freq_factor: float = 1.0
    beta_fast: float = 32.0
    beta_slow: float = 1.0
    mrope_section: List[int] = None
    use_logn_attn: int = 0
    cache_block_seq_len: int = 64

9 changes: 9 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -119,4 +119,13 @@ def tokenizer_info(self):
    def model_info(self):
        cfg = super().model_info()
        cfg['attn_bias'] = 1
        params_path = osp.join(self.model_path, 'config.json')
        with open(params_path) as f:
            config = json.load(f)
        rope_scaling = config['rope_scaling']
        if rope_scaling is not None:
            if rope_scaling.get('type', '') == 'mrope':
                selection = rope_scaling['mrope_section']
                cfg['rope_scaling_type'] = 'mrope'
                cfg['mrope_section'] = selection
        return cfg
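The added parsing above forwards the model's M-RoPE settings into the TurboMind config. Below is a small, self-contained sketch of the `rope_scaling` block it expects; the `[16, 24, 24]` section sizes are an assumed example (check the checkpoint's own config.json), and the parsing mirrors the logic above.

```python
import json

# Hypothetical excerpt of a Qwen2-VL config.json; the [16, 24, 24] split
# (temporal/height/width rotary channels) is an assumed example.
raw = '{"rope_scaling": {"type": "mrope", "mrope_section": [16, 24, 24]}}'
config = json.loads(raw)

cfg = {}
rope_scaling = config['rope_scaling']
if rope_scaling is not None and rope_scaling.get('type', '') == 'mrope':
    cfg['rope_scaling_type'] = 'mrope'
    cfg['mrope_section'] = rope_scaling['mrope_section']

print(cfg)  # {'rope_scaling_type': 'mrope', 'mrope_section': [16, 24, 24]}
```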
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -20,6 +20,8 @@
    QWenLMHeadModel='qwen',
    # Qwen2
    Qwen2ForCausalLM='qwen2',
    # Qwen2-VL
    Qwen2VLForConditionalGeneration='qwen2',
    # mistral
    MistralForCausalLM='llama',
    # llava
22 changes: 22 additions & 0 deletions lmdeploy/turbomind/turbomind.py
@@ -515,6 +515,8 @@ def prepare_inputs(self,
gen_config: GenerationConfig,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -572,6 +574,18 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
inputs['input_embeddings'] = input_embeddings
inputs['input_embedding_ranges'] = input_embedding_ranges

if mrope_position_ids is not None:
assert isinstance(mrope_position_ids, torch.Tensor)
assert isinstance(mrope_position_delta, torch.Tensor)
assert input_lengths.size(0) == 1
assert mrope_position_ids.size(-1) == input_ids.size(-1)
mrope_position_ids = pad_sequence([mrope_position_ids],
batch_first=True,
padding_value=-1).transpose(
1, 2).int().reshape(1, -1)
inputs['mrope_position_ids'] = mrope_position_ids
inputs['mrope_position_delta'] = mrope_position_delta

if gen_config.min_new_tokens is not None:
inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens,
np.int32)
@@ -611,6 +625,8 @@ async def async_stream_infer(self,
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -648,6 +664,8 @@
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
mrope_position_ids=mrope_position_ids,
mrope_position_delta=mrope_position_delta,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
@@ -734,6 +752,8 @@ def stream_infer(self,
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -766,6 +786,8 @@ def stream_infer(self,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
mrope_position_ids=mrope_position_ids,
mrope_position_delta=mrope_position_delta,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
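In turbomind.py, `prepare_inputs` and the `*_infer` entry points now accept `mrope_position_ids` (shape `(3, seq_len)`) and a per-sequence `mrope_position_delta`. The sketch below covers only the text-only degenerate case, where the three position streams coincide and the delta is zero; real multimodal prompts derive the height/width streams from the image grid (Qwen2-VL's `get_rope_index`), so treat this as an illustration rather than the vision preprocessor itself.

```python
import torch

def text_only_mrope_positions(seq_len: int):
    # For a prompt without images the temporal/height/width position streams
    # coincide and the decode-time offset (delta) is 0. Shapes match the
    # assertions in prepare_inputs: (3, seq_len) ids, one delta per sequence.
    ids = torch.arange(seq_len, dtype=torch.int32)
    mrope_position_ids = ids.unsqueeze(0).repeat(3, 1)
    mrope_position_delta = torch.zeros(1, dtype=torch.int32)
    return mrope_position_ids, mrope_position_delta

pos_ids, delta = text_only_mrope_positions(8)
print(pos_ids.shape, delta)  # torch.Size([3, 8]) tensor([0], dtype=torch.int32)
```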
8 changes: 7 additions & 1 deletion src/turbomind/kernels/attention/attention_params.h
@@ -67,10 +67,16 @@ struct AttentionParams {
float llama3_inv_scaling_factor;
float llama3_alpha;
float llama3_beta;
// the following are use by yarn
// the following are used by yarn
float yarn_ramp_inv_factor_div_2;
float yarn_ramp_inv_factor_mul_min;
float yarn_inv_scaling_factor;
// the following are used by qwen2-vl
int3 mrope_section;
int* mrope_position_ids; // 3 x session_len_
int mrope_offset; // session_len_
int* mrope_position_delta;
int* mrope_position_length;

// log(n) attention
bool use_logn_attn;
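`AttentionParams` now carries the `mrope_section` split plus per-token position-id streams and deltas. The intent, reconstructed here as a hedged Python sketch rather than the CUDA kernel itself, is that each rotary channel reads its position from one of the three streams according to the section boundaries.

```python
def mrope_position_for_channel(channel_idx, positions_thw, section=(16, 24, 24)):
    # positions_thw: (temporal, height, width) position ids for one token.
    # section: rotary channels assigned to each stream; (16, 24, 24) is an
    # assumed example whose entries sum to half the head dimension.
    t, h, w = section
    if channel_idx < t:
        return positions_thw[0]      # temporal stream
    if channel_idx < t + h:
        return positions_thw[1]      # height stream
    return positions_thw[2]          # width stream

# Example: channel 20 falls in the height section.
print(mrope_position_for_channel(20, (5, 3, 7)))  # 3
```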
12 changes: 12 additions & 0 deletions src/turbomind/kernels/attention/attention_universal.h
@@ -223,6 +223,14 @@ struct AttentionUniversal {

ApplyBias(vec_Q, vec_K, vec_V, params, head_idx, kv_head_idx, offset);

int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (params.mrope_position_ids != nullptr) {
mrope_ids = params.mrope_position_ids + batch_idx * 3 * params.mrope_offset;
mrope_length = params.mrope_position_length[batch_idx];
mrope_delta = params.mrope_position_delta[batch_idx];
}
const float rope_base = params.rope_theta ? params.rope_theta[batch_idx] : params.rotary_embedding_base;
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -239,6 +247,10 @@
params.yarn_ramp_inv_factor_mul_min,
params.yarn_inv_scaling_factor,
params.attention_scaling,
params.mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
64 changes: 64 additions & 0 deletions src/turbomind/kernels/attention/kv_cache_utils_v2.cu
@@ -31,6 +31,11 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -125,6 +130,14 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
}

if (rope_base) {
int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (mrope_position_ids != nullptr) {
mrope_ids = mrope_position_ids + batch_idx * 3 * mrope_offset;
mrope_length = mrope_position_length[batch_idx];
mrope_delta = mrope_position_delta[batch_idx];
}
float base = rope_base[batch_idx];
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -141,6 +154,10 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
@@ -222,6 +239,11 @@ void invokeProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -268,6 +290,11 @@ void invokeProcessKV_v2(char** blocks,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_position_ids,
mrope_offset,
mrope_position_delta,
mrope_position_length,
stride_b,
stride_c,
stride_h,
@@ -307,6 +334,11 @@ void invokeProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min, \
float yarn_inv_scaling_factor, \
float attention_scaling, \
int3 mrope_section, \
int* mrope_position_ids, \
int mrope_offset, \
int* mrope_position_delta, \
int* mrope_position_length, \
int64_t stride_b, \
int64_t stride_c, \
int64_t stride_h, \
@@ -342,6 +374,11 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -419,6 +456,14 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
}

if (rope_base) {
int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (mrope_position_ids != nullptr) {
mrope_ids = mrope_position_ids + batch_idx * 3 * mrope_offset;
mrope_length = mrope_position_length[batch_idx];
mrope_delta = mrope_position_delta[batch_idx];
}
float base = rope_base[batch_idx];
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -435,6 +480,10 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
@@ -477,6 +526,11 @@ void invokeFlattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -520,6 +574,11 @@ void invokeFlattenKV_v2(T* k,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_position_ids,
mrope_offset,
mrope_position_delta,
mrope_position_length,
stride_b,
stride_c,
stride_h,
@@ -556,6 +615,11 @@ void invokeFlattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min, \
float yarn_inv_scaling_factor, \
float attention_scaling, \
int3 mrope_section, \
int* mrope_position_ids, \
int mrope_offset, \
int* mrope_position_delta, \
int* mrope_position_length, \
int64_t stride_b, \
int64_t stride_c, \
int64_t stride_h, \
