
Commit a315d2d

Update TensorRT-LLM backend (#715)
1 parent 071ee5e commit a315d2d

File tree

23 files changed: +797 −107 lines


all_models/inflight_batcher_llm/preprocessing/1/model.py

Lines changed: 132 additions & 29 deletions
@@ -28,6 +28,7 @@
 import io
 import json
 import os
+from collections import defaultdict
 from typing import List
 
 import numpy as np
@@ -64,8 +65,8 @@ def initialize(self, args):
 
         add_special_tokens = model_config['parameters'].get(
             'add_special_tokens')
-        visual_model_path = model_config['parameters']['visual_model_path'][
-            'string_value']
+        multimodal_model_path = model_config['parameters'][
+            'multimodal_model_path']['string_value']
         max_num_images = model_config['parameters'].get('max_num_images')
 
         if max_num_images is not None:
@@ -82,8 +83,8 @@ def initialize(self, args):
                 f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default."
             )
             self.max_num_images = None
-        if visual_model_path == "${visual_model_path}" or visual_model_path == "":
-            visual_model_path = None
+        if multimodal_model_path == "${multimodal_model_path}" or multimodal_model_path == "":
+            multimodal_model_path = None
 
         if add_special_tokens is not None:
             add_special_tokens_str = add_special_tokens['string_value'].lower()
@@ -125,17 +126,19 @@ def initialize(self, args):
         self.model_type = None
         self.vision_preprocessor = None
 
-        if visual_model_path is not None:
+        if multimodal_model_path is not None:
             self.is_multimodal = True
-            visual_model_path = os.path.join(visual_model_path, 'config.json')
-            with open(visual_model_path, 'r') as f:
+            multimodal_model_path = os.path.join(multimodal_model_path,
+                                                 'config.json')
+            with open(multimodal_model_path, 'r') as f:
                 visual_model_config = json.load(f)
             self.model_type = visual_model_config['builder_config'][
                 'model_type']
 
             assert self.model_type in [
-                'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision'
-            ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama and llava_onevision. Got {self.model_type}."
+                'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision',
+                'qwen2_vl'
+            ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama, llava_onevision and qwen2_vl. Got {self.model_type}."
 
             assert self.model_type != 'llava_onevison' or self.max_num_images is None or self.max_num_images <= 1, f"LLaVA-OneVsion is not support multi image inference currently."
 
@@ -148,7 +151,7 @@ def initialize(self, args):
                 llm_model_config["pretrained_config"]["vocab_size"])
             self._setup_ptable_shape(llm_model_config)
 
-            if self.model_type == 'mllama' or self.model_type == 'llava_onevision':
+            if self.model_type in ['mllama', 'llava_onevision', 'qwen2_vl']:
                 self.vision_preprocessor = VisionPreProcessor(
                     self.model_type,
                     AutoProcessor.from_pretrained(tokenizer_dir), model_config)
@@ -182,12 +185,12 @@ def _setup_ptable_shape(self, llm_model_config):
             'max_prompt_embedding_table_size']
         max_batch_size = llm_model_config['build_config']['max_batch_size']
 
-        num_visual_features = max_prompt_embedding_table_size // max_batch_size
+        num_multimodal_features = max_prompt_embedding_table_size // max_batch_size
         hidden_size = llm_model_config['pretrained_config']['hidden_size']
         if self.max_num_images is not None:
-            num_visual_features = num_visual_features // self.max_num_images
+            num_multimodal_features = num_multimodal_features // self.max_num_images
 
-        self.ptable_shape = (-1, num_visual_features, hidden_size)
+        self.ptable_shape = (-1, num_multimodal_features, hidden_size)
 
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
@@ -303,6 +306,18 @@ def execute(self, requests):
                         queries=query.astype(str).tolist(),
                         video_bytes=video_bytes,
                     )
+                elif self.model_type == 'qwen2_vl':
+                    processed_tensors = self.vision_preprocessor.qwen2_vl_process_image(
+                        queries=query.astype(str).tolist(),
+                        img_urls=img_urls,
+                        image_bytes=image_bytes,
+                    )
+                    qwen2vl_input_id_tensor = processed_tensors.get(
+                        "INPUT_IDS")
+                    processed_tensors.pop("INPUT_IDS")
+                    qwen2vl_input_length_tensor = processed_tensors.get(
+                        "REQUEST_INPUT_LEN")
+                    processed_tensors.pop("REQUEST_INPUT_LEN")
                 else:
                     raise ValueError(
                         "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs"
@@ -315,7 +330,7 @@ def execute(self, requests):
                 assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided"
 
             # Preprocessing input data.
-            # For the LLaVA_OneVision model, num_visual_features is not a fixed value
+            # For the LLaVA_OneVision model, num_multimodal_features is not a fixed value
             input_id, request_input_len = self._create_request(
                 query, visual_tokens)
             if decoder_query is not None:
@@ -333,7 +348,7 @@ def execute(self, requests):
                 embedding_bias_words, embedding_bias_weights,
                 self.embedding_bias_weights_dtype, batch_size)
 
-            if prompt_table_extra_id is not None:
+            if prompt_table_extra_id is not None and self.model_type != 'qwen2_vl':
                 prompt_table_extra_ids = np.zeros_like(input_id)
                 for i in range(batch_size):
                     prompt_table_extra_ids[i] = np.where(
@@ -342,11 +357,18 @@ def execute(self, requests):
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
-            input_id_tensor = pb_utils.Tensor(
-                'INPUT_ID', input_id.astype(self.input_id_dtype))
-            request_input_len_tensor = pb_utils.Tensor(
-                'REQUEST_INPUT_LEN',
-                request_input_len.astype(self.request_input_len_dtype))
+            # Qwen2-VL model has special logic to process input ids
+            if self.model_type == 'qwen2_vl':
+                input_id_tensor = pb_utils.Tensor.from_dlpack(
+                    'INPUT_ID', qwen2vl_input_id_tensor)
+                request_input_len_tensor = pb_utils.Tensor.from_dlpack(
+                    'REQUEST_INPUT_LEN', qwen2vl_input_length_tensor)
+            else:
+                input_id_tensor = pb_utils.Tensor(
+                    'INPUT_ID', input_id.astype(self.input_id_dtype))
+                request_input_len_tensor = pb_utils.Tensor(
+                    'REQUEST_INPUT_LEN',
+                    request_input_len.astype(self.request_input_len_dtype))
             decoder_input_id_tensor = pb_utils.Tensor(
                 'DECODER_INPUT_ID',
                 decoder_input_id.astype(self.decoder_input_id_dtype))
@@ -365,7 +387,6 @@ def execute(self, requests):
                 np.array(end_id, dtype=np.int32))
             pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
                                             np.array(pad_id, dtype=np.int32))
-
             if prompt_table_extra_id is not None:
                 prompt_table_extra_ids_tensor = pb_utils.Tensor(
                     'OUT_PROMPT_TABLE_EXTRA_IDS',
@@ -389,7 +410,6 @@ def execute(self, requests):
                     end_id_tensor, pad_id_tensor
                 ] + vision_processed_tensors)
             responses.append(inference_response)
-
         # You should return a list of pb_utils.InferenceResponse. Length
         # of this list must match the length of `requests` list.
        return responses
@@ -442,7 +462,7 @@ def _setup_fake_prompts(self, batch_size, batch_split_prompts):
            np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs.
        """
 
-        num_visual_features = self.ptable_shape[1]
+        num_multimodal_features = self.ptable_shape[1]
        input_ids_list = []
 
        for batch_idx in range(batch_size):
@@ -453,8 +473,8 @@ def _setup_fake_prompts(self, batch_size, batch_split_prompts):
            for split_idx in range(len(splits) - 1):
                fake_prompt_id = np.arange(
                    sample_fake_prompt_counter,
-                    sample_fake_prompt_counter + num_visual_features)
-                sample_fake_prompt_counter += num_visual_features
+                    sample_fake_prompt_counter + num_multimodal_features)
+                sample_fake_prompt_counter += num_multimodal_features
                fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0)
                sample_input_ids.append(fake_prompt_id)
                sample_input_ids.append(splits[split_idx + 1])
@@ -528,6 +548,9 @@ def _create_request(self, query, visual_tokens=None):
                ).astype(int) for s in query
            ]
        else:
+            # Qwen2-VL input id is calculated when processing image
+            if 'qwen2_vl' == self.model_type:
+                return None, None
            if self.is_multimodal and self.max_num_images and self.max_num_images > 1:
                start_ids = self._process_multi_image_inputs(query)
 
@@ -553,7 +576,6 @@ def _create_request(self, query, visual_tokens=None):
            elif 'llava_onevision' == self.model_type:
                pre_prompt = "<|im_start|>user "
                post_prompt = "<|im_end|><|im_start|>assistant\n"
-
                pre_prompt_id = np.array(
                    self.tokenizer.encode(
                        pre_prompt,
@@ -797,12 +819,11 @@ def mllama_process(self, queries, img_urls=None, image_bytes=None):
                images=images[batch_id],
                text=queries[batch_id],
                return_tensors="pt")
-
            # Reshape pixel_values to [num_images, *HWC/CHW]
            val = processed_vision_data["pixel_values"]
-
            val = val.reshape(1, -1, *(val.shape[-3:]))
            processed_vision_data["pixel_values"] = val
+
            # Create vision output tensors
            for key in possible_output_names:
                val = processed_vision_data.get(key.lower())
@@ -850,7 +871,6 @@ def llava_onevision_process_image(self,
            processed_vision_data = self.vision_model_processor(
                images=images[batch_id], text='<image>', return_tensors="pt")
            visual_tokens.append(processed_vision_data['input_ids'].shape[1])
-
            # Create vision output tensors
            for key in possible_output_names:
                val = processed_vision_data.get(key.lower())
@@ -906,3 +926,86 @@ def llava_onevision_process_video(self, queries, video_bytes=None):
                    val, self.output_str_dtypes[key])
                vision_processed_tensors[key] = val
        return vision_processed_tensors, visual_tokens
+
+    def qwen2_vl_process_image(self, queries, img_urls=None, image_bytes=None):
+        import torch
+        vision_processed_tensors = {}
+        # Retrieved from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/config.json
+        vision_token_id = 151654
+        image_token_id = 151655
+        video_token_id = 151656
+        vocab_size = 152064
+
+        if img_urls is not None:
+            # download and read images
+            images = [
+                self.load_images_from_urls(urls)
+                for urls in img_urls.as_numpy()
+            ]
+        else:
+            images = [
+                img for img_list in self.load_images_tensor(image_bytes)
+                for img in img_list
+            ]
+        batch_size = len(images)
+        preprocessor_outputs = defaultdict(list)
+        possible_output_names = [
+            'PIXEL_VALUES', 'IMAGE_GRID_THW', 'ATTENTION_MASK', 'INPUT_IDS'
+        ]
+        for batch_id in range(batch_size):
+            messages = [{
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": images[batch_id],
+                    },
+                    {
+                        "type":
+                        "text",
+                        "text":
+                        queries[batch_id][0] if isinstance(
+                            queries[batch_id], list) else queries[batch_id],
+                    },
+                ],
+            }]
+            text_inputs = self.vision_model_processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True)
+            # Preprocess images and query
+            processed_vision_data = self.vision_model_processor(
+                images=images[batch_id],
+                text=text_inputs,
+                padding=True,
+                return_tensors="pt")
+
+            # Create vision output tensors
+            for key in possible_output_names:
+                val = processed_vision_data.get(key.lower())
+                if val is not None:
+                    # Add two dummy dim to reshape pixel value tensor to 5 dim
+                    if key == 'PIXEL_VALUES':
+                        val = val.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                    elif key == 'INPUT_IDS':
+                        val = val.to(torch.int32)
+                        pre_process_val = val.clone()
+                        mask = (val == image_token_id) | (
+                            val == vision_token_id) | (val == video_token_id)
+                        cumulative_counts = mask.cumsum(dim=1,
+                                                        dtype=torch.int32)
+                        values = (vocab_size - 1) + cumulative_counts
+                        val[mask] = values[mask]
+                        preprocessor_outputs["VISION_INPUT_ID"].append(
+                            pre_process_val)
+                        preprocessor_outputs["REQUEST_INPUT_LEN"].append(
+                            torch.tensor([val.shape[1]],
+                                         dtype=torch.int32).unsqueeze(0))
+                    preprocessor_outputs[key].append(val)
+
+        for key, tensor_list in preprocessor_outputs.items():
+            val = self.convert_tensor_list_to_tensor(tensor_list)
+            if key in self.output_str_dtypes:
+                val = self.convert_tensor_to_str_dtype(
+                    val, self.output_str_dtypes[key])
+            vision_processed_tensors[key] = val
+        return vision_processed_tensors
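
Editorial note (not part of the commit): the INPUT_IDS branch in the new qwen2_vl_process_image rewrites every image/vision/video placeholder token to a consecutive id starting at the vocabulary size, matching how _setup_fake_prompts builds "fake prompt" ids that index the prompt embedding table for the other multimodal models, while the untouched ids are preserved as VISION_INPUT_ID. A minimal standalone sketch of that remapping, using the Qwen2-VL token ids quoted in the diff:

import torch

# Token ids quoted from the Qwen2-VL config referenced in the diff above.
vision_token_id, image_token_id, video_token_id = 151654, 151655, 151656
vocab_size = 152064

# Toy sequence: text tokens interleaved with three image placeholder tokens.
val = torch.tensor(
    [[100, image_token_id, image_token_id, 200, image_token_id, 300]],
    dtype=torch.int32)

# Mark every multimodal placeholder and replace it with a running id >= vocab_size.
mask = (val == image_token_id) | (val == vision_token_id) | (val == video_token_id)
cumulative_counts = mask.cumsum(dim=1, dtype=torch.int32)
val[mask] = ((vocab_size - 1) + cumulative_counts)[mask]

print(val.tolist())  # [[100, 152064, 152065, 200, 152066, 300]]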

all_models/inflight_batcher_llm/preprocessing/config.pbtxt

Lines changed: 18 additions & 2 deletions
@@ -199,6 +199,22 @@ output [
     name: "IS_VIDEO_INPUT"
     data_type: TYPE_BOOL
     dims: [ 1 ]
+  },
+  # Required for Qwen2-VL vision encoder
+  {
+    name: "ATTENTION_MASK"
+    data_type: TYPE_INT64
+    dims: [ -1 ]
+  },
+  {
+    name: "IMAGE_GRID_THW"
+    data_type: TYPE_INT64
+    dims: [ 3 ]
+  },
+  {
+    name: "VISION_INPUT_ID"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
   }
 ]
 
@@ -217,9 +233,9 @@ parameters {
 }
 
 parameters {
-  key: "visual_model_path"
+  key: "multimodal_model_path"
   value: {
-    string_value: "${visual_model_path}"
+    string_value: "${multimodal_model_path}"
   }
 }
 
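
Cross-reference (editorial note, not part of the commit): the three output tensors added to this config carry the Qwen2-VL data produced by qwen2_vl_process_image in the preprocessing model.py diff above. A rough sketch of where each one originates; names are taken from the two diffs, and the listed dims are per item with the batch dimension implied:

# Illustrative mapping only, drawn from the diffs in this commit.
qwen2_vl_preprocessing_outputs = {
    'ATTENTION_MASK': "processed_vision_data['attention_mask']",        # TYPE_INT64, dims [ -1 ]
    'IMAGE_GRID_THW': "processed_vision_data['image_grid_thw']",        # TYPE_INT64, dims [ 3 ]
    'VISION_INPUT_ID': "pre_process_val (input ids before remapping)",  # TYPE_INT32, dims [ -1 ]
}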

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 24 additions & 0 deletions
@@ -466,6 +466,27 @@ def get_lookahead_decoding_config_from_request(request,
         lookahead_verification_set_size)
 
 
+def get_mrope_config_from_request(request, batch_size=1, batch_index=0):
+    mrope_rotary_cos_sin = get_input_tensor_by_name(request,
+                                                    'mrope_rotary_cos_sin',
+                                                    batch_size, batch_index)
+    mrope_position_deltas = get_input_tensor_by_name(request,
+                                                     'mrope_position_deltas',
+                                                     batch_size,
+                                                     batch_index,
+                                                     force_on_torch=False)
+    assert (mrope_rotary_cos_sin is None) == (
+        mrope_position_deltas is None
+    ), "Both mrope_rotary_cos_sin and mrope_position_detals must be either None or not None."
+
+    if mrope_rotary_cos_sin is not None and mrope_position_deltas is not None:
+        mrope_config = trtllm.MropeConfig(
+            mrope_rotary_cos_sin=mrope_rotary_cos_sin[0],
+            mrope_position_deltas=mrope_position_deltas[0])
+        return mrope_config
+    return None
+
+
 def build_1_2_5_buckets(max_value: int) -> List[int]:
     """
     Builds a list of buckets with increasing powers of 10 multiplied by
@@ -564,6 +585,8 @@ def convert_request(request,
         request, batch_size, batch_index)
     prompt_tuning_config = get_prompt_tuning_config_from_request(
         request, batch_size, batch_index, input_length)
+    mrope_config = get_mrope_config_from_request(request, batch_size,
+                                                 batch_index)
     lora_config = get_lora_config_from_request(request, batch_size,
                                                batch_index)
     kv_cache_retention_config = get_kv_cache_retention_config_from_request(
@@ -621,6 +644,7 @@ def convert_request(request,
         output_config=output_config,
         external_draft_tokens_config=external_draft_tokens_config,
         prompt_tuning_config=prompt_tuning_config,
+        mrope_config=mrope_config,
         lora_config=lora_config,
         guided_decoding_params=guided_decoding_params,
         lookahead_config=request_lookahead_config,
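
The new get_mrope_config_from_request helper follows the same shape as the other per-request config getters: read optional input tensors, validate them, and build an executor config object that convert_request forwards to the request. A condensed standalone sketch of the contract it enforces (editorial illustration, not part of the commit; the callable argument stands in for trtllm.MropeConfig):

from typing import Any, Optional

def build_mrope_config(rotary_cos_sin: Optional[Any],
                       position_deltas: Optional[Any],
                       mrope_config_cls: Any) -> Optional[Any]:
    # Both mrope tensors must be supplied together or not at all,
    # mirroring the assert in the diff above.
    assert (rotary_cos_sin is None) == (position_deltas is None), \
        "mrope_rotary_cos_sin and mrope_position_deltas must be set together."
    if rotary_cos_sin is None:
        return None  # request carries no mrope inputs
    return mrope_config_cls(mrope_rotary_cos_sin=rotary_cos_sin,
                            mrope_position_deltas=position_deltas)

# Example with a plain dict standing in for the config class:
cfg = build_mrope_config([0.1, 0.2], [3], lambda **kw: kw)
assert cfg == {'mrope_rotary_cos_sin': [0.1, 0.2], 'mrope_position_deltas': [3]}
assert build_mrope_config(None, None, lambda **kw: kw) is None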
