@@ -28,6 +28,7 @@
 import io
 import json
 import os
+from collections import defaultdict
 from typing import List

 import numpy as np
@@ -64,8 +65,8 @@ def initialize(self, args):

         add_special_tokens = model_config['parameters'].get(
             'add_special_tokens')
-        visual_model_path = model_config['parameters']['visual_model_path'][
-            'string_value']
+        multimodal_model_path = model_config['parameters'][
+            'multimodal_model_path']['string_value']
         max_num_images = model_config['parameters'].get('max_num_images')

         if max_num_images is not None:
@@ -82,8 +83,8 @@ def initialize(self, args):
                 f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default."
             )
             self.max_num_images = None
-        if visual_model_path == "${visual_model_path}" or visual_model_path == "":
-            visual_model_path = None
+        if multimodal_model_path == "${multimodal_model_path}" or multimodal_model_path == "":
+            multimodal_model_path = None

         if add_special_tokens is not None:
             add_special_tokens_str = add_special_tokens['string_value'].lower()
@@ -125,17 +126,19 @@ def initialize(self, args):
         self.model_type = None
         self.vision_preprocessor = None

-        if visual_model_path is not None:
+        if multimodal_model_path is not None:
             self.is_multimodal = True
-            visual_model_path = os.path.join(visual_model_path, 'config.json')
-            with open(visual_model_path, 'r') as f:
+            multimodal_model_path = os.path.join(multimodal_model_path,
+                                                 'config.json')
+            with open(multimodal_model_path, 'r') as f:
                 visual_model_config = json.load(f)
             self.model_type = visual_model_config['builder_config'][
                 'model_type']

             assert self.model_type in [
-                'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision'
-            ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama and llava_onevision. Got {self.model_type}."
+                'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision',
+                'qwen2_vl'
+            ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama, llava_onevision and qwen2_vl. Got {self.model_type}."

             assert self.model_type != 'llava_onevison' or self.max_num_images is None or self.max_num_images <= 1, f"LLaVA-OneVsion is not support multi image inference currently."

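For reference, the only field this block reads from the multimodal engine directory is builder_config.model_type inside config.json. A minimal sketch of that slice, with hypothetical surrounding content, is shown below (not taken from a real engine build):

# Sketch of the slice of <multimodal_model_path>/config.json consumed above;
# everything except builder_config.model_type is illustrative.
visual_model_config = {
    "builder_config": {
        "model_type": "qwen2_vl",
        # ... other build metadata written by the multimodal engine builder ...
    }
}
model_type = visual_model_config['builder_config']['model_type']
assert model_type in [
    'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision', 'qwen2_vl'
]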
@@ -148,7 +151,7 @@ def initialize(self, args):
                 llm_model_config["pretrained_config"]["vocab_size"])
             self._setup_ptable_shape(llm_model_config)

-            if self.model_type == 'mllama' or self.model_type == 'llava_onevision':
+            if self.model_type in ['mllama', 'llava_onevision', 'qwen2_vl']:
                 self.vision_preprocessor = VisionPreProcessor(
                     self.model_type,
                     AutoProcessor.from_pretrained(tokenizer_dir), model_config)
@@ -182,12 +185,12 @@ def _setup_ptable_shape(self, llm_model_config):
             'max_prompt_embedding_table_size']
         max_batch_size = llm_model_config['build_config']['max_batch_size']

-        num_visual_features = max_prompt_embedding_table_size // max_batch_size
+        num_multimodal_features = max_prompt_embedding_table_size // max_batch_size
         hidden_size = llm_model_config['pretrained_config']['hidden_size']
         if self.max_num_images is not None:
-            num_visual_features = num_visual_features // self.max_num_images
+            num_multimodal_features = num_multimodal_features // self.max_num_images

-        self.ptable_shape = (-1, num_visual_features, hidden_size)
+        self.ptable_shape = (-1, num_multimodal_features, hidden_size)

     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
@@ -303,6 +306,18 @@ def execute(self, requests):
                                 queries=query.astype(str).tolist(),
                                 video_bytes=video_bytes,
                             )
+                    elif self.model_type == 'qwen2_vl':
+                        processed_tensors = self.vision_preprocessor.qwen2_vl_process_image(
+                            queries=query.astype(str).tolist(),
+                            img_urls=img_urls,
+                            image_bytes=image_bytes,
+                        )
+                        qwen2vl_input_id_tensor = processed_tensors.get(
+                            "INPUT_IDS")
+                        processed_tensors.pop("INPUT_IDS")
+                        qwen2vl_input_length_tensor = processed_tensors.get(
+                            "REQUEST_INPUT_LEN")
+                        processed_tensors.pop("REQUEST_INPUT_LEN")
                     else:
                         raise ValueError(
                             "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs"
@@ -315,7 +330,7 @@ def execute(self, requests):
                 assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided"

             # Preprocessing input data.
-            # For the LLaVA_OneVision model, num_visual_features is not a fixed value
+            # For the LLaVA_OneVision model, num_multimodal_features is not a fixed value
             input_id, request_input_len = self._create_request(
                 query, visual_tokens)
             if decoder_query is not None:
@@ -333,7 +348,7 @@ def execute(self, requests):
                 embedding_bias_words, embedding_bias_weights,
                 self.embedding_bias_weights_dtype, batch_size)

-            if prompt_table_extra_id is not None:
+            if prompt_table_extra_id is not None and self.model_type != 'qwen2_vl':
                 prompt_table_extra_ids = np.zeros_like(input_id)
                 for i in range(batch_size):
                     prompt_table_extra_ids[i] = np.where(
@@ -342,11 +357,18 @@ def execute(self, requests):

             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
-            input_id_tensor = pb_utils.Tensor(
-                'INPUT_ID', input_id.astype(self.input_id_dtype))
-            request_input_len_tensor = pb_utils.Tensor(
-                'REQUEST_INPUT_LEN',
-                request_input_len.astype(self.request_input_len_dtype))
+            # Qwen2-VL model has special logic to process input ids
+            if self.model_type == 'qwen2_vl':
+                input_id_tensor = pb_utils.Tensor.from_dlpack(
+                    'INPUT_ID', qwen2vl_input_id_tensor)
+                request_input_len_tensor = pb_utils.Tensor.from_dlpack(
+                    'REQUEST_INPUT_LEN', qwen2vl_input_length_tensor)
+            else:
+                input_id_tensor = pb_utils.Tensor(
+                    'INPUT_ID', input_id.astype(self.input_id_dtype))
+                request_input_len_tensor = pb_utils.Tensor(
+                    'REQUEST_INPUT_LEN',
+                    request_input_len.astype(self.request_input_len_dtype))
             decoder_input_id_tensor = pb_utils.Tensor(
                 'DECODER_INPUT_ID',
                 decoder_input_id.astype(self.decoder_input_id_dtype))
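In the qwen2_vl branch above, the input IDs arrive as torch tensors produced during image preprocessing, so they are handed to Triton through DLPack instead of the NumPy constructor used by the other models. A minimal standalone sketch of the two constructions (runnable only inside the Triton Python backend, where pb_utils is importable; names and values are illustrative):

# Sketch of the two pb_utils.Tensor paths used above.
import numpy as np
import torch
import triton_python_backend_utils as pb_utils  # provided by the Triton Python backend

# Default path: ids tokenized on the host as a NumPy array.
np_ids = np.array([[1, 2, 3]], dtype=np.int32)
input_id_tensor = pb_utils.Tensor('INPUT_ID', np_ids)

# Qwen2-VL path: ids already live in a torch tensor, wrapped zero-copy via DLPack.
torch_ids = torch.tensor([[1, 2, 3]], dtype=torch.int32)
input_id_tensor = pb_utils.Tensor.from_dlpack('INPUT_ID', torch_ids)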
@@ -365,7 +387,6 @@ def execute(self, requests):
                                             np.array(end_id, dtype=np.int32))
             pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
                                             np.array(pad_id, dtype=np.int32))
-
             if prompt_table_extra_id is not None:
                 prompt_table_extra_ids_tensor = pb_utils.Tensor(
                     'OUT_PROMPT_TABLE_EXTRA_IDS',
@@ -389,7 +410,6 @@ def execute(self, requests):
                     end_id_tensor, pad_id_tensor
                 ] + vision_processed_tensors)
             responses.append(inference_response)
-
         # You should return a list of pb_utils.InferenceResponse. Length
         # of this list must match the length of `requests` list.
         return responses
@@ -442,7 +462,7 @@ def _setup_fake_prompts(self, batch_size, batch_split_prompts):
             np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs.
         """

-        num_visual_features = self.ptable_shape[1]
+        num_multimodal_features = self.ptable_shape[1]
         input_ids_list = []

         for batch_idx in range(batch_size):
@@ -453,8 +473,8 @@ def _setup_fake_prompts(self, batch_size, batch_split_prompts):
             for split_idx in range(len(splits) - 1):
                 fake_prompt_id = np.arange(
                     sample_fake_prompt_counter,
-                    sample_fake_prompt_counter + num_visual_features)
-                sample_fake_prompt_counter += num_visual_features
+                    sample_fake_prompt_counter + num_multimodal_features)
+                sample_fake_prompt_counter += num_multimodal_features
                 fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0)
                 sample_input_ids.append(fake_prompt_id)
                 sample_input_ids.append(splits[split_idx + 1])
@@ -528,6 +548,9 @@ def _create_request(self, query, visual_tokens=None):
                 ).astype(int) for s in query
             ]
         else:
+            # Qwen2-VL input id is calculated when processing image
+            if 'qwen2_vl' == self.model_type:
+                return None, None
             if self.is_multimodal and self.max_num_images and self.max_num_images > 1:
                 start_ids = self._process_multi_image_inputs(query)

@@ -553,7 +576,6 @@ def _create_request(self, query, visual_tokens=None):
             elif 'llava_onevision' == self.model_type:
                 pre_prompt = "<|im_start|>user "
                 post_prompt = "<|im_end|><|im_start|>assistant\n"
-
                 pre_prompt_id = np.array(
                     self.tokenizer.encode(
                         pre_prompt,
@@ -797,12 +819,11 @@ def mllama_process(self, queries, img_urls=None, image_bytes=None):
                 images=images[batch_id],
                 text=queries[batch_id],
                 return_tensors="pt")
-
             # Reshape pixel_values to [num_images, *HWC/CHW]
             val = processed_vision_data["pixel_values"]
-
             val = val.reshape(1, -1, *(val.shape[-3:]))
             processed_vision_data["pixel_values"] = val
+
             # Create vision output tensors
             for key in possible_output_names:
                 val = processed_vision_data.get(key.lower())
@@ -850,7 +871,6 @@ def llava_onevision_process_image(self,
             processed_vision_data = self.vision_model_processor(
                 images=images[batch_id], text='<image>', return_tensors="pt")
             visual_tokens.append(processed_vision_data['input_ids'].shape[1])
-
             # Create vision output tensors
             for key in possible_output_names:
                 val = processed_vision_data.get(key.lower())
@@ -906,3 +926,86 @@ def llava_onevision_process_video(self, queries, video_bytes=None):
                         val, self.output_str_dtypes[key])
                 vision_processed_tensors[key] = val
         return vision_processed_tensors, visual_tokens
+
+    def qwen2_vl_process_image(self, queries, img_urls=None, image_bytes=None):
+        import torch
+        vision_processed_tensors = {}
+        # Retrieved from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/config.json
+        vision_token_id = 151654
+        image_token_id = 151655
+        video_token_id = 151656
+        vocab_size = 152064
+
+        if img_urls is not None:
+            # download and read images
+            images = [
+                self.load_images_from_urls(urls)
+                for urls in img_urls.as_numpy()
+            ]
+        else:
+            images = [
+                img for img_list in self.load_images_tensor(image_bytes)
+                for img in img_list
+            ]
+        batch_size = len(images)
+        preprocessor_outputs = defaultdict(list)
+        possible_output_names = [
+            'PIXEL_VALUES', 'IMAGE_GRID_THW', 'ATTENTION_MASK', 'INPUT_IDS'
+        ]
+        for batch_id in range(batch_size):
+            messages = [{
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": images[batch_id],
+                    },
+                    {
+                        "type":
+                        "text",
+                        "text":
+                        queries[batch_id][0] if isinstance(
+                            queries[batch_id], list) else queries[batch_id],
+                    },
+                ],
+            }]
+            text_inputs = self.vision_model_processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True)
+            # Preprocess images and query
+            processed_vision_data = self.vision_model_processor(
+                images=images[batch_id],
+                text=text_inputs,
+                padding=True,
+                return_tensors="pt")
+
+            # Create vision output tensors
+            for key in possible_output_names:
+                val = processed_vision_data.get(key.lower())
+                if val is not None:
+                    # Add two dummy dim to reshape pixel value tensor to 5 dim
+                    if key == 'PIXEL_VALUES':
+                        val = val.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                    elif key == 'INPUT_IDS':
+                        val = val.to(torch.int32)
+                        pre_process_val = val.clone()
+                        mask = (val == image_token_id) | (
+                            val == vision_token_id) | (val == video_token_id)
+                        cumulative_counts = mask.cumsum(dim=1,
+                                                        dtype=torch.int32)
+                        values = (vocab_size - 1) + cumulative_counts
+                        val[mask] = values[mask]
+                        preprocessor_outputs["VISION_INPUT_ID"].append(
+                            pre_process_val)
+                        preprocessor_outputs["REQUEST_INPUT_LEN"].append(
+                            torch.tensor([val.shape[1]],
+                                         dtype=torch.int32).unsqueeze(0))
+                    preprocessor_outputs[key].append(val)
+
+        for key, tensor_list in preprocessor_outputs.items():
+            val = self.convert_tensor_list_to_tensor(tensor_list)
+            if key in self.output_str_dtypes:
+                val = self.convert_tensor_to_str_dtype(
+                    val, self.output_str_dtypes[key])
+            vision_processed_tensors[key] = val
+        return vision_processed_tensors
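The non-obvious step in qwen2_vl_process_image is the INPUT_IDS rewrite: every image/vision/video placeholder token is replaced by a strictly increasing out-of-vocabulary id starting at vocab_size, which the TensorRT-LLM engine resolves to consecutive rows of the prompt embedding table holding the visual features, while the untouched copy is kept as VISION_INPUT_ID. A toy, self-contained illustration of the same remapping (vocab_size and the token id below are made up, far smaller than the real Qwen2-VL values):

# Toy illustration of the placeholder-id remapping performed on INPUT_IDS above.
import torch

vocab_size = 100
image_token_id = 90

ids = torch.tensor([[5, 90, 90, 90, 7]], dtype=torch.int32)
mask = ids == image_token_id
cumulative_counts = mask.cumsum(dim=1, dtype=torch.int32)   # [[0, 1, 2, 3, 3]]
ids[mask] = ((vocab_size - 1) + cumulative_counts)[mask]    # placeholders -> 100, 101, 102
print(ids)  # tensor([[  5, 100, 101, 102,   7]], dtype=torch.int32)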