
[Misc][V1] Avoid using envs.VLLM_USE_V1 in mm processing (#14256)
Signed-off-by: Roger Wang <[email protected]>
ywang96 authored Mar 5, 2025
1 parent 32985be commit ec79b67
Showing 7 changed files with 38 additions and 8 deletions.
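In short: before this commit, the base multi-modal processor decided on its own whether to build per-item content hashes by checking envs.VLLM_USE_V1; after it, the caller passes an explicit return_mm_hashes flag down the preprocessing chain and the processing layer no longer imports vllm.envs at all. A minimal before/after sketch, with simplified names and signatures rather than the real vLLM classes:

import os
from typing import Optional


class ProcessorBefore:
    """Pre-commit behaviour: a global environment check decides whether hashes are built."""

    def apply(self, prompt: str, mm_data: dict) -> dict:
        use_v1 = os.environ.get("VLLM_USE_V1", "0") == "1"
        mm_hashes: Optional[dict] = {"image": []} if use_v1 else None
        return {"prompt": prompt, "mm_hashes": mm_hashes}


class ProcessorAfter:
    """Post-commit behaviour: the caller states explicitly whether it needs hashes."""

    def apply(self, prompt: str, mm_data: dict,
              return_mm_hashes: bool = False) -> dict:
        mm_hashes: Optional[dict] = {"image": []} if return_mm_hashes else None
        return {"prompt": prompt, "mm_hashes": mm_hashes}

Making the flag an ordinary argument keeps the processing layer agnostic of the engine version: V0 call sites keep the old behaviour simply by never passing it.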
24 changes: 22 additions & 2 deletions vllm/inputs/preprocess.py
@@ -254,6 +254,7 @@ def _process_multimodal(
mm_data: MultiModalDataDict,
mm_processor_kwargs: Optional[Mapping[str, object]],
lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Apply the model's multi-modal processor to a multi-modal prompt,
@@ -274,14 +275,16 @@ def _process_multimodal(
if mm_processor_kwargs is None:
mm_processor_kwargs = {}

return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
return_mm_hashes)

async def _process_multimodal_async(
self,
prompt: Union[str, List[int]],
mm_data: MultiModalDataDict,
mm_processor_kwargs: Optional[Mapping[str, object]],
lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""Async version of :meth:`_process_multimodal`."""
# At the moment on model (PrithviGeoSpatialMAE) requires to be
@@ -299,13 +302,15 @@ async def _process_multimodal_async(
if mm_processor_kwargs is None:
mm_processor_kwargs = {}

return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
return_mm_hashes)

def _prompt_to_llm_inputs(
self,
prompt: SingletonPrompt,
request_id: str,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs:
"""
Extract the singleton inputs from a prompt.
@@ -315,6 +320,7 @@ def _prompt_to_llm_inputs(
* request_id
* prompt: single encoder or decoder input prompt
* lora_request: this is only valid for decoder prompts
* return_mm_hashes: whether to return multimodal hashes
Returns:
@@ -349,6 +355,7 @@ def _prompt_to_llm_inputs(
multi_modal_data,
mm_processor_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)

return token_inputs(
@@ -695,6 +702,7 @@ def _process_decoder_only_prompt(
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""
For decoder-only models:
@@ -706,6 +714,7 @@ def _process_decoder_only_prompt(
* request_id
* lora_request
* prompt_adapter_request
* return_mm_hashes
Returns:
@@ -729,6 +738,7 @@ async def _process_decoder_only_prompt_async(
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""Async version of :meth:`_process_decoder_only_prompt`."""
prompt_comps = await self._prompt_to_llm_inputs_async(
@@ -748,9 +758,13 @@ def preprocess(
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""Preprocess the input prompt."""
if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
"returned until they are supported on vLLM V1.")
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
return self._process_encoder_decoder_prompt(
@@ -768,6 +782,7 @@ def preprocess(
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=return_mm_hashes,
)

async def preprocess_async(
@@ -776,9 +791,13 @@ async def preprocess_async(
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""Async version of :meth:`preprocess`."""
if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
"returned until they are supported on vLLM V1.")
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
return await self._process_encoder_decoder_prompt_async(
@@ -796,4 +815,5 @@ async def preprocess_async(
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=return_mm_hashes,
)
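Condensed view of the call chain changed above, as a sketch with assumed, simplified signatures (the real methods also take request_id, LoRA and prompt-adapter arguments):

class InputPreprocessorSketch:
    """Simplified stand-in for vllm.inputs.preprocess.InputPreprocessor."""

    def __init__(self, mm_processor, is_encoder_decoder: bool = False):
        self.mm_processor = mm_processor
        self.is_encoder_decoder = is_encoder_decoder

    def preprocess(self, prompt, mm_data, return_mm_hashes: bool = False):
        if self.is_encoder_decoder:
            # Hashes are not yet supported for encoder-decoder models on V1,
            # so callers must not request them on this path.
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models are not "
                "supported yet")
            return self._process_multimodal(prompt, mm_data, False)
        return self._process_multimodal(prompt, mm_data, return_mm_hashes)

    def _process_multimodal(self, prompt, mm_data, return_mm_hashes: bool):
        # The flag is forwarded untouched to the model's multi-modal processor.
        return self.mm_processor.apply(prompt, mm_data, {}, return_mm_hashes)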
4 changes: 3 additions & 1 deletion vllm/model_executor/models/llava.py
@@ -767,6 +767,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
@@ -777,7 +778,8 @@
image_height=-1,
)

result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
return_mm_hashes)

mm_items = self._to_mm_items(mm_data)
mm_item_counts = mm_items.get_all_counts()
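The model files below follow the same pattern as llava.py: in minicpmv.py and mllama.py the overriding apply() gains a return_mm_hashes parameter and forwards it to super().apply(), so only the base class decides whether to hash; prithvi_geospatial_mae.py only adds the parameter to keep the signature compatible. A schematic sketch with hypothetical class names:

from typing import Mapping, Union


class BaseProcessorSketch:
    def apply(self,
              prompt: Union[str, list[int]],
              mm_data: dict,
              hf_processor_mm_kwargs: Mapping[str, object],
              return_mm_hashes: bool = False) -> dict:
        return {"prompt": prompt,
                "mm_hashes": {} if return_mm_hashes else None}


class LlavaLikeProcessorSketch(BaseProcessorSketch):
    def apply(self,
              prompt: Union[str, list[int]],
              mm_data: dict,
              hf_processor_mm_kwargs: Mapping[str, object],
              return_mm_hashes: bool = False) -> dict:
        # Forward the flag; the base class does the hashing (or not).
        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
                               return_mm_hashes)
        # ...model-specific placeholder fix-ups happen here in the real code...
        return result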
4 changes: 3 additions & 1 deletion vllm/model_executor/models/minicpmv.py
@@ -780,6 +780,7 @@ def apply(
prompt: Union[str, List[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
supported_mm_modalities = self.info.get_supported_mm_modalities()
if isinstance(prompt, list):
@@ -791,7 +792,8 @@
[index for index, m in enumerate(matches) if m == modality])
for modality in supported_mm_modalities
}
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
return_mm_hashes)
# Exclude <image_id>x</image_id> from placeholders
if "image" in result["mm_placeholders"] and \
self.info.get_model_version() == (2, 6):
4 changes: 3 additions & 1 deletion vllm/model_executor/models/mllama.py
@@ -175,8 +175,10 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalEncDecInputs:
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
return_mm_hashes)

# Check that the number of image tokens in the decoder prompt matches
# the number of images provided in mm_data
1 change: 1 addition & 0 deletions vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -93,6 +93,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
mm_kwargs = {}

8 changes: 5 additions & 3 deletions vllm/multimodal/processing.py
@@ -14,7 +14,6 @@
from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
from typing_extensions import assert_never

import vllm.envs as envs
from vllm.inputs import InputProcessingContext
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
@@ -1435,6 +1434,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Process multi-modal inputs to be used in vLLM.
@@ -1451,11 +1451,11 @@
"""
mm_items = self._to_mm_items(mm_data)

# Create MM hashes (only used in V1)
# Create MM hashes to be returned (only used in V1)
# TODO: Use these hash keys for caching operations in apply_hf_processor
# instead of rehashing.

if envs.VLLM_USE_V1:
if return_mm_hashes:
model_id = self.info.model_id
mm_hashes = {
modality: [
@@ -1554,6 +1554,7 @@ def apply(
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalEncDecInputs:
"""
Process multi-modal inputs to be used in vLLM.
Expand All @@ -1567,6 +1568,7 @@ def apply(
encoder_prompt,
mm_data,
hf_processor_mm_kwargs,
return_mm_hashes,
)

tokenizer = self.info.get_tokenizer()
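This hunk is the heart of the change: the branch in the base apply() that builds per-item hashes now keys off the return_mm_hashes argument instead of importing vllm.envs and checking VLLM_USE_V1. The sketch below illustrates only that branch; the hashing helper and the exact hash inputs are assumptions for illustration, not vLLM's real hasher:

import hashlib
from typing import Optional


def _hash_item(model_id: str, modality: str, item_repr: str) -> str:
    # Illustrative stand-in for vLLM's per-item content hashing.
    payload = f"{model_id}:{modality}:{item_repr}".encode()
    return hashlib.sha256(payload).hexdigest()


def apply_sketch(model_id: str,
                 prompt: str,
                 mm_items: dict,
                 return_mm_hashes: bool = False) -> dict:
    # Previously guarded by `if envs.VLLM_USE_V1:`; now the caller decides.
    mm_hashes: Optional[dict] = None
    if return_mm_hashes:
        mm_hashes = {
            modality: [_hash_item(model_id, modality, repr(item))
                       for item in items]
            for modality, items in mm_items.items()
        }
    return {"prompt": prompt, "mm_hashes": mm_hashes}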
1 change: 1 addition & 0 deletions vllm/v1/engine/processor.py
@@ -131,6 +131,7 @@ def process_inputs(
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=self.use_hash,
)
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

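The one call site that actually requests hashes is the V1 engine processor above, which passes its own use_hash decision. In the sketch below the value is simply taken from the constructor; how vLLM derives it (e.g. from multi-modal or prefix-caching settings) is an assumption here:

class V1ProcessorSketch:
    """Simplified stand-in for vllm.v1.engine.processor.Processor."""

    def __init__(self, input_preprocessor, use_hash: bool):
        self.input_preprocessor = input_preprocessor
        self.use_hash = use_hash

    def process_inputs(self, prompt, mm_data):
        # The env check that used to live inside the multi-modal processor
        # is now an ordinary argument supplied by this V1-only call site.
        return self.input_preprocessor.preprocess(
            prompt,
            mm_data,
            return_mm_hashes=self.use_hash,
        )

Because return_mm_hashes defaults to False everywhere, call paths that never pass the flag keep their previous behaviour, while V1 obtains its hashes without any environment inspection inside the processing layer.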
