diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 206a76e52b7ab..2545635da3200 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -254,6 +254,7 @@ def _process_multimodal(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -274,7 +275,8 @@ def _process_multimodal(
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}

-        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
+        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
+                                  return_mm_hashes)

     async def _process_multimodal_async(
         self,
@@ -282,6 +284,7 @@ async def _process_multimodal_async(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         # At the moment on model (PrithviGeoSpatialMAE) requires to be
@@ -299,13 +302,15 @@ async def _process_multimodal_async(
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}

-        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
+        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
+                                  return_mm_hashes)

     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
+        return_mm_hashes: bool = False,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -315,6 +320,7 @@ def _prompt_to_llm_inputs(
         * request_id
         * prompt: single encoder or decoder input prompt
         * lora_request: this is only valid for decoder prompts
+        * return_mm_hashes: whether to return multimodal hashes

         Returns:

@@ -349,6 +355,7 @@ def _prompt_to_llm_inputs(
                     multi_modal_data,
                     mm_processor_kwargs,
                     lora_request=lora_request,
+                    return_mm_hashes=return_mm_hashes,
                 )

             return token_inputs(
@@ -695,6 +702,7 @@ def _process_decoder_only_prompt(
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -706,6 +714,7 @@ def _process_decoder_only_prompt(
         * request_id
         * lora_request
         * prompt_adapter_request
+        * return_mm_hashes

         Returns:

@@ -729,6 +738,7 @@ async def _process_decoder_only_prompt_async(
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """Async version of :meth:`_process_decoder_only_prompt`."""
         prompt_comps = await self._prompt_to_llm_inputs_async(
@@ -748,9 +758,13 @@ def preprocess(
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
+            assert not return_mm_hashes, (
+                "Multimodal hashes for encoder-decoder models should not be "
+                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return self._process_encoder_decoder_prompt(
@@ -768,6 +782,7 @@ def preprocess(
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=return_mm_hashes,
         )

     async def preprocess_async(
@@ -776,9 +791,13 @@ async def preprocess_async(
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
         """Async version of :meth:`preprocess`."""
         if self.model_config.is_encoder_decoder:
+            assert not return_mm_hashes, (
+                "Multimodal hashes for encoder-decoder models should not be "
+                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return await self._process_encoder_decoder_prompt_async(
@@ -796,4 +815,5 @@ async def preprocess_async(
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=return_mm_hashes,
         )
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 542eb944de9ed..66b79f809bc96 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -767,6 +767,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -777,7 +778,8 @@ def apply(
             image_height=-1,
         )

-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                               return_mm_hashes)

         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 1816bf5d008d7..cf103edd0bccf 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -780,6 +780,7 @@ def apply(
         prompt: Union[str, List[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         supported_mm_modalities = self.info.get_supported_mm_modalities()
         if isinstance(prompt, list):
@@ -791,7 +792,8 @@ def apply(
                 [index for index, m in enumerate(matches) if m == modality])
             for modality in supported_mm_modalities
         }
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                               return_mm_hashes)
         # Exclude x from placeholders
         if "image" in result["mm_placeholders"] and \
             self.info.get_model_version() == (2, 6):
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index b1ccd8e851c26..f74fa7a466296 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -175,8 +175,10 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalEncDecInputs:
-        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                                  return_mm_hashes)

         # Check that the number of image tokens in the decoder prompt matches
         # the number of images provided in mm_data
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
index d922329b3a499..3f5faea4f875c 100644
--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -93,6 +93,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         mm_kwargs = {}

diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 7232df074f840..3f13cd8582fe8 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -14,7 +14,6 @@
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import assert_never

-import vllm.envs as envs
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
@@ -1435,6 +1434,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1451,11 +1451,11 @@ def apply(
         """
         mm_items = self._to_mm_items(mm_data)

-        # Create MM hashes (only used in V1)
+        # Create MM hashes to be returned (only used in V1)
         # TODO: Use these hash keys for caching operations in apply_hf_processor
         # instead of rehashing.

-        if envs.VLLM_USE_V1:
+        if return_mm_hashes:
             model_id = self.info.model_id
             mm_hashes = {
                 modality: [
@@ -1554,6 +1554,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1567,6 +1568,7 @@ def apply(
             encoder_prompt,
             mm_data,
             hf_processor_mm_kwargs,
+            return_mm_hashes,
         )

         tokenizer = self.info.get_tokenizer()
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 3a3fc69e53e44..d687ed49b71e0 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -131,6 +131,7 @@ def process_inputs(
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=self.use_hash,
         )

         eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
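For orientation, here is a minimal usage sketch (not part of the diff) of how a caller might thread the new flag through `InputPreprocessor.preprocess`, mirroring the V1 `Processor.process_inputs` change above. The helper name `preprocess_with_hashes` is hypothetical, and the sketch assumes `preprocess` takes the prompt as its first positional argument (as `_prompt_to_llm_inputs` does in the diff); only the `return_mm_hashes` keyword comes from the changes shown here.

```python
# Hypothetical sketch, not part of this diff. Assumes `preprocessor` is an
# already-constructed vllm.inputs.preprocess.InputPreprocessor and `prompt`
# is a decoder-only (possibly multimodal) prompt.
def preprocess_with_hashes(preprocessor, prompt, request_id: str):
    # return_mm_hashes=True asks the multimodal processor to compute and
    # return per-modality content hashes (only used by V1, per the comment in
    # vllm/multimodal/processing.py). It must stay False for encoder-decoder
    # models, which assert against it in preprocess().
    return preprocessor.preprocess(
        prompt,
        request_id=request_id,
        lora_request=None,
        prompt_adapter_request=None,
        return_mm_hashes=True,
    )
```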