diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 8cdc663a0320f..e59150cdd3b83 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -745,7 +745,7 @@ See [this page](#generative-models) for more information on how to use generativ
- `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
- ✅︎
- ✅︎
- -
+ - ✅︎
* - `Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
- T + A<sup>+</sup>
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index fe5b733c750a8..b575ec6acbef3 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -16,7 +16,6 @@
def _test_processing_correctness(
model_id: str,
- modalities: dict[str, bool],
hit_rate: float,
num_batches: int,
simplify_rate: float,
@@ -25,11 +24,6 @@ def _test_processing_correctness(
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
- limit_mm_per_prompt = {
- modality: 3 if supports_multi else 1
- for modality, supports_multi in modalities.items()
- }
-
model_config = ModelConfig(
model_id,
task="auto",
@@ -40,18 +34,29 @@ def _test_processing_correctness(
dtype="float16",
revision=None,
hf_overrides=model_info.hf_overrides,
- limit_mm_per_prompt=limit_mm_per_prompt,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext(
model_config,
- tokenizer=cached_get_tokenizer(model_config.tokenizer),
+ tokenizer=cached_get_tokenizer(
+ model_config.tokenizer,
+ trust_remote_code=model_info.trust_remote_code,
+ ),
)
# Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30)
+ processing_info = factories.info(ctx)
+ supported_mm_limits = processing_info.get_supported_mm_limits()
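+ # get_supported_mm_limits() reports None for modalities without a fixed
+ # upper bound; cap those at 3 items per prompt for this test.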
+ limit_mm_per_prompt = {
+ modality: 3 if limit is None else limit
+ for modality, limit in supported_mm_limits.items()
+ }
+
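+ # Propagate the derived limits onto the multimodal config so that the
+ # processors under test observe them.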
+ model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+
baseline_processor = factories.build_processor(ctx, cache=None)
cached_processor = factories.build_processor(ctx, cache=cache)
dummy_inputs = baseline_processor.dummy_inputs
@@ -82,8 +87,8 @@ def _test_processing_correctness(
mm_data = {
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
- for _ in range(rng.randint(limit_mm_per_prompt[k]))]
- for k in modalities
+ for _ in range(rng.randint(limit))]
+ for k, limit in limit_mm_per_prompt.items()
}
mm_counts = {k: len(vs) for k, vs in mm_data.items()}
@@ -135,21 +140,22 @@ def _test_processing_correctness(
# yapf: disable
-# True if the model supports multiple data items of the modality per request
-@pytest.mark.parametrize(("model_id", "modalities"), [
- ("rhymes-ai/Aria", {"image": True}),
- ("Salesforce/blip2-opt-2.7b", {"image": False}),
- ("facebook/chameleon-7b", {"image": False}),
- ("deepseek-ai/deepseek-vl2-tiny", {"image": True}),
- ("adept/fuyu-8b", {"image": False}),
- ("llava-hf/llava-1.5-7b-hf", {"image": True}),
- ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
- ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
- ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501
- ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
- ("mistral-community/pixtral-12b", {"image": True}),
- ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
- ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
- ("fixie-ai/ultravox-v0_3", {"audio": True}),
+@pytest.mark.parametrize("model_id", [
+ "rhymes-ai/Aria",
+ "Salesforce/blip2-opt-2.7b",
+ "facebook/chameleon-7b",
+ "deepseek-ai/deepseek-vl2-tiny",
+ "adept/fuyu-8b",
+ "llava-hf/llava-1.5-7b-hf",
+ "llava-hf/llava-v1.6-mistral-7b-hf",
+ "llava-hf/LLaVA-NeXT-Video-7B-hf",
+ "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+ "TIGER-Lab/Mantis-8B-siglip-llama3",
+ "mistral-community/pixtral-12b",
+ "Qwen/Qwen-VL-Chat",
+ "Qwen/Qwen2-VL-2B-Instruct",
+ "Qwen/Qwen2-Audio-7B-Instruct",
+ "fixie-ai/ultravox-v0_3",
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@@ -157,14 +163,12 @@ def _test_processing_correctness(
# yapf: enable
def test_processing_correctness(
model_id: str,
- modalities: dict[str, bool],
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
_test_processing_correctness(
model_id,
- modalities,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
@@ -172,16 +176,13 @@ def test_processing_correctness(
# yapf: disable
-@pytest.mark.parametrize(("model_id", "modalities"), [
- ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
-])
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness_phi3v(
model_id: str,
- modalities: dict[str, bool],
hit_rate: float,
num_batches: int,
simplify_rate: float,
@@ -195,7 +196,6 @@ def test_processing_correctness_phi3v(
_test_processing_correctness(
model_id,
- modalities,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
diff --git a/tests/models/multimodal/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py
deleted file mode 100644
index af0ace711ba3e..0000000000000
--- a/tests/models/multimodal/processing/test_qwen.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""Tests for Qwen's multimodal preprocessing kwargs."""
-from typing import Dict, List, Union
-
-import pytest
-import torch
-from PIL.Image import Image
-
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal import MultiModalKwargs
-from vllm.multimodal.utils import cached_get_tokenizer
-
-from ....conftest import IMAGE_ASSETS
-from ...utils import build_model_context
-
-### Multimodal preprocessing tests
-SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
-# These values are specific to Qwen-VL/Chat; we can get these from the model
-# config also, but they are hardcoded here to keep the parameterize/fixtures
-# easy to read.
-IMG_START_ID = 151857
-IMG_END_ID = 151858
-IMG_PAD_ID = 151859
-TOKS_PER_IMG = 256
-VIS_ENC_DIM = 4096
-IMG_SIZE = 448
-
-
-@pytest.fixture()
-def input_mapper_for_qwen():
- # Lazy import to avoid initializing CUDA during test collection
- from vllm.model_executor.models.qwen import input_mapper_for_qwen
- return input_mapper_for_qwen
-
-
-@pytest.fixture()
-def input_processor_for_qwen():
- # Lazy import to avoid initializing CUDA during test collection
- from vllm.model_executor.models.qwen import input_processor_for_qwen
- return input_processor_for_qwen
-
-
-@pytest.fixture()
-def qwen_vl_context() -> InputContext:
- """Get an InputContext for Qwen-VL."""
- return build_model_context(model_name="Qwen/Qwen-VL",
- trust_remote_code=True)
-
-
-# Happy path tests for single/multi-image scenarios for the multimodal
-# input processor and mapper, respectively
-@pytest.mark.parametrize("num_images", [1, 2])
-def test_input_processor_valid_mm_data(input_processor_for_qwen,
- qwen_vl_context: InputContext,
- num_images: int):
- """Happy cases for image inputs to Qwen's multimodal input processor."""
- prompt = "".join(
- [f"Picture {num}:
\n" for num in range(1, num_images + 1)])
- inputs = token_inputs(
- prompt=prompt,
- # When processing multimodal data for a multimodal model, the qwen
- # input processor will overwrite the provided prompt_token_ids with
- # the image prompts
- prompt_token_ids=[],
- multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
- )
- proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
- assert isinstance(proc_inputs, dict)
-
- # Each image should have one start / stop and a fixed context of 256
- proc_tokens = proc_inputs["prompt_token_ids"]
- assert proc_tokens.count(IMG_START_ID) == num_images
- assert proc_tokens.count(IMG_END_ID) == num_images
- assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
-
-
-@pytest.mark.parametrize(
- "img_data,expected_shape",
- [
- # single / multi-image
- (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
- (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
- # single / multi-image embeddings
- (torch.rand(
- (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
- (torch.rand(
- (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
- (torch.rand(
- (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
- ])
-def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
- qwen_vl_context: InputContext,
- img_data: Union[torch.Tensor, List[Image],
- Image],
- expected_shape: List[int]):
- """Happy cases for image inputs to Qwen's multimodal input mapper."""
- mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
- # Ensure that we get the appropriately shaped pixel_values
- # for images and image embeddings, respectively.
- assert isinstance(mapped_img_data, MultiModalKwargs)
- assert "pixel_values" in mapped_img_data
- assert mapped_img_data["pixel_values"].shape == expected_shape
-
-
-# Sad path tests for the multimodal input processor and mapper, respectively
-@pytest.mark.parametrize("mm_data", [
- {
- "image": torch.rand(5)
- },
- {
- "image": torch.rand((5, 5, 5, 5, 5))
- },
-])
-def test_input_processor_invalid_mm_data(input_processor_for_qwen,
- qwen_vl_context: InputContext,
- mm_data: Dict[str, torch.Tensor]):
- """Test sad cases validated in Qwen's multimodal input processor."""
- tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
- trust_remote_code=True)
- prompt = "Picture 1:
\n"
- prompt_token_ids = tokenizer.encode(prompt)
- inputs = token_inputs(prompt=prompt,
- prompt_token_ids=prompt_token_ids,
- multi_modal_data=mm_data)
- # Should fail since we have too many or too few dimensions for embeddings
- with pytest.raises(ValueError):
- input_processor_for_qwen(qwen_vl_context, inputs)
-
-
-@pytest.mark.parametrize(
- "img_data",
- [
- # Wrong context length
- torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
- # Wrong visual encoder output size
- torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
- ])
-def test_input_mapper_invalid_mm_data(
- input_mapper_for_qwen,
- qwen_vl_context: InputContext,
- img_data: Union[torch.Tensor, List[Image], Image],
-):
- """Sad cases validated in Qwen VL's multimodal input mapper."""
- with pytest.raises(ValueError):
- input_mapper_for_qwen(qwen_vl_context, img_data)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 1345b381f0a99..86a9d3089c3ee 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -4,26 +4,28 @@
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights."""
+import copy
import math
import re
-from functools import partial
-from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
- Optional, Set, Tuple, TypedDict, Union)
+import unicodedata
+from functools import lru_cache, partial
+from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable,
+ List, Literal, Mapping, Optional, Set, Tuple, TypedDict,
+ Union)
-import numpy as np
import torch
-from PIL import Image
from torch import nn
from torchvision import transforms
from torchvision.transforms import InterpolationMode
-from transformers import PretrainedConfig
+from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
+ TensorType)
+from transformers.image_utils import ImageInput
+from transformers.tokenization_utils_base import TextInput
from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
- InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -42,15 +44,20 @@
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import IntermediateTensors, SequenceData
-from vllm.utils import is_list_of
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+ NestedTensors)
+from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+ BaseProcessingInfo, PromptReplacement,
+ PromptReplacementDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.sequence import IntermediateTensors
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import (flatten_bn, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
- maybe_prefix)
+ maybe_prefix, merge_multimodal_embeddings)
logger = init_logger(__name__)
@@ -353,8 +360,10 @@ def __init__(self,
self.ln_post = norm_layer(output_dim)
self.proj = nn.Parameter(
(output_dim**-0.5) * torch.randn(output_dim, output_dim))
+
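+ # The image start/end/pad markers occupy three consecutive token IDs in
+ # the Qwen-VL vocabulary (151857-151859 in the reference tokenizer).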
self.image_start_id = image_start_id
self.image_end_id = image_start_id + 1
+ self.image_pad_id = image_start_id + 2
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.to(
@@ -383,21 +392,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return x
- def get_image_positions(self,
- input_ids: torch.Tensor) -> Optional[torch.Tensor]:
- """Given the input IDs, extracts start/stop points corresponding to
- images.
-
- args:
- Returns:
- Optional torch tensor corresponding to start/stop pairs of images.
- """
- if torch.any(input_ids == self.image_start_id):
- bos_pos = torch.where(input_ids == self.image_start_id)
- eos_pos = torch.where(input_ids == self.image_end_id)
- return torch.stack((bos_pos[0], eos_pos[0]), dim=1)
- return None
-
class QWenMLP(nn.Module):
"""MLP for the language component of the Qwen model, which contains a
@@ -579,9 +573,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.make_empty_intermediate_tensors = (
make_empty_intermediate_tensors_factory(
["hidden_states", "residual"], config.hidden_size))
- self.visual = VisionTransformer(**config.visual,
- quant_config=quant_config) if hasattr(
- config, "visual") else None
+
+ if (vision_config := getattr(config, "visual", None)):
+ self.visual = VisionTransformer(**vision_config,
+ quant_config=quant_config)
+ else:
+ self.visual = None
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.wte(input_ids)
@@ -593,38 +590,13 @@ def forward(
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors],
- pixel_values: Optional[QwenImageInputs],
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
- img_pos = None
- # If pixel / visual embeddings are provided, this is a visual model
- if pixel_values is not None and self.visual is not None:
- if pixel_values["type"] != "image_embeds":
- image_embeds = self.visual(pixel_values["data"])
- else:
- image_embeds = pixel_values["data"]
-
- # features should be of shape (# images, 256, hidden_dim)
- img_pos = self.visual.get_image_positions(input_ids)
- if isinstance(
- img_pos,
- np.ndarray) and img_pos.shape[0] != image_embeds.shape[0]:
- raise ValueError(
- f"Number of placeholders: {img_pos.shape[0]} "
- f"does not match number of images {image_embeds.shape[0]}."
- )
-
if get_pp_group().is_first_rank:
if inputs_embeds is not None:
hidden_states = inputs_embeds
else:
hidden_states = self.get_input_embeddings(input_ids)
- hidden_states = self.wte(input_ids)
- # Merge the image embeddings into the hidden states if actually have
- # visual features and the corresponding image tokens
- if img_pos is not None:
- for idx, (img_bos, img_eos) in enumerate(img_pos):
- hidden_states[img_bos + 1:img_eos] = image_embeds[idx]
residual = None
else:
assert intermediate_tensors is not None
@@ -648,159 +620,9 @@ def forward(
return hidden_states
-def get_image_text(image_num: int, padding: bool) -> str:
- """Retrieves a placeholder text that when tokenized, will be expanded with
- image pads.
-
- Args:
- image_num: The number of the image that we want a text prompt for.
- Images should be indexed starting at 1.
- padding: Whether or not padding should be manually added.
-
- Returns:
- Text placeholder prompt for the image being considered.
- """
- image_start = f"Picture {image_num}: {IMG_START}"
- image_end = f"{IMG_END}\n"
- if not padding:
- return f"{image_start}{image_end}"
- return f"{image_start}{MAX_QWEN_IMG_TOKENS * IMG_PAD}{image_end}"
-
-
-def input_processor_for_qwen(ctx: InputContext,
- inputs: DecoderOnlyInputs) -> DecoderOnlyInputs:
- """Processes the inputs, which may or may not be multimodal.
- Multimodal inputs will only be processed if the model has a "visual"
- component in its model config, otherwise they'll be ignored.
-
- Args:
- ctx: Context of the loaded model.
- inputs: LLM inputs which may have a multi_modal_data attribute.
-
- Returns:
- If the model is language only or not multimodal inputs were provided,
- returns inputs unmodified. Otherwise, processes the multimodal
- images / image embeddings and adds the fixed-length image placeholders.
- """
- multi_modal_data = inputs.get("multi_modal_data")
-
- # Only process images if we have multimodal data and a visual config
- hf_config = ctx.get_hf_config()
- if (multi_modal_data is None or "image" not in multi_modal_data
- or not hasattr(hf_config, "visual")):
- return inputs
-
- prompt = inputs.get("prompt")
- prompt_token_ids = inputs["prompt_token_ids"]
- model_config = ctx.model_config
- tokenizer = cached_get_tokenizer(
- model_config.tokenizer,
- trust_remote_code=model_config.trust_remote_code)
- image_data = multi_modal_data["image"]
- if isinstance(image_data, torch.Tensor):
- num_dims = len(image_data.shape)
- if num_dims < 2 or num_dims > 3:
- raise ValueError(
- f"Expected img embeds to be have 3 dimensions, got {num_dims}")
- num_images = 1 if num_dims == 2 else image_data.shape[0]
- elif isinstance(image_data, Image.Image):
- num_images = 1
- elif is_list_of(image_data, Image.Image):
- num_images = len(image_data)
- else:
- raise TypeError(f"Invalid image type: {type(image_data)}")
-
- if prompt is None:
- prompt = tokenizer.decode(prompt_token_ids)
-
- # Drops anything between <img>/</img> tags; encoding with the tokenizer
- # will automatically add the image pads for the context.
- new_prompt, num_matched_images = re.subn(
- r"(Picture \d*:
).*?(<\/img>\n)",
- r"\1\2",
- prompt,
- )
-
- if num_matched_images != num_images:
- logger.warning(
- "Number of matched image placeholders %s doesn't match the number "
- "of expected images %s; check your placeholder formatting.",
- num_matched_images, num_images)
-
- new_prompt_token_ids = tokenizer.encode(new_prompt)
-
- return token_inputs(prompt=new_prompt,
- prompt_token_ids=new_prompt_token_ids,
- multi_modal_data=multi_modal_data)
-
-
-def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
- """Maps the input data to its MultiModalKwargs (if any).
-
- Args:
- ctx: Context of the loaded model.
- data: data potentially containing image/image embeddings to be mapped
- to pixel_values in .forward() for a visual QWenLMHeadModel model.
-
- Returns:
- MultiModalKwargs containing the stacked normalized images tensor or
- image embeddings.
- """
- # Early exit if we have provided an image to a language only Qwen model
- hf_config = ctx.get_hf_config()
- if not hasattr(hf_config, "visual"):
- logger.warning(
- "Images were provided but this model has no visual config; "
- "multimodal inputs will not be forwarded to the model.")
- return MultiModalKwargs()
-
- model_config = ctx.model_config
- tokenizer = cached_get_tokenizer(
- model_config.tokenizer,
- trust_remote_code=model_config.trust_remote_code)
-
- image_pair_tok = tokenizer.encode(IMG_START + IMG_END,
- add_special_tokens=False,
- return_tensors="pt").squeeze()
- image_start_id = image_pair_tok[0]
- image_end_id = image_pair_tok[-1]
- if (image_start_id + 1) != image_end_id:
- raise ValueError(
- f"Found image end ID {image_end_id}, but expected {IMG_START} + 1")
- if len(image_pair_tok) != (MAX_QWEN_IMG_TOKENS + 2):
- raise ValueError(
- f"Expected image context length of {MAX_QWEN_IMG_TOKENS}, "
- f"but got {image_pair_tok - 2}")
-
- hf_config = ctx.get_hf_config()
- image_size = hf_config.visual["image_size"]
- img_emb_size = hf_config.visual["output_dim"]
-
- if isinstance(data, torch.Tensor):
- # It's expected that our values have already been processed
- # by the visual transformer; shape is expected to be:
- # (# images, 256, hidden_size)
- if len(data.shape) == 2:
- # Assume only one image embed was provided; unsqueeze the extra dim
- data = data.unsqueeze(0)
- if len(data.shape) != 3 or data.shape[
- 1] != MAX_QWEN_IMG_TOKENS or data.shape[2] != img_emb_size:
- raise ValueError(
- "Expected image embeds to be a tensor of shape"
- f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but "
- f"received shape [{data.shape}]")
- pixel_values = data
- else:
- transform = build_normalization_transform(image_size)
- if not isinstance(data, (list, tuple)):
- data = [data]
- transformed_images = [transform(datum) for datum in data]
- pixel_values = torch.stack(transformed_images, dim=0)
- return MultiModalKwargs({"pixel_values": pixel_values})
-
-
def build_normalization_transform(image_size: int) -> transforms.Compose:
- """Builds a normalization transform which can be applied to one or
+ """
+ Build a normalization transform which can be applied to one or
more input images from which we want to extract visual features.
Args:
@@ -817,62 +639,251 @@ def build_normalization_transform(image_size: int) -> transforms.Compose:
])
-def dummy_data_for_qwen(
- ctx: InputContext,
- seq_len: int,
- mm_counts: Mapping[str, int],
-) -> DummyData:
- """Build dummy data for warming up Qwen models; this will only contain text
- matching the defaults for VLLM unless the model has a visual config.
+@lru_cache(maxsize=1)
+def _get_tokenizer_without_image_pad(
+ tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
+ """
+ The logic of adding image pad tokens should only be applied in
+ :class:`QWenVLProcessor`, so they are patched out here.
- Args:
- ctx: Context of the loaded model.
- seq_len: Number of tokens in the text sequence.
- mm_counts: multimodal data counts.
-
- Returns:
- Tuple containing sequential and multimodal data.
+ The definition of the wrapped tokenizer can be found here:
+ https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
+ """
+ new_tokenizer = copy.deepcopy(tokenizer)
+
+ class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore
+
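+ # Encode/decode with plain tiktoken calls, bypassing the image-pad
+ # insertion that the upstream Qwen-VL tokenizer performs between
+ # <img> and </img>.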
+ def tokenize(
+ self,
+ text: str,
+ allowed_special: Union[AbstractSet[str], str] = "all",
+ disallowed_special: Union[Collection[str], str] = (),
+ **kwargs,
+ ) -> list[Union[bytes, str]]:
+ text = unicodedata.normalize("NFC", text)
+
+ return [
+ self.decoder[t] for t in self.tokenizer.encode(
+ text,
+ allowed_special=allowed_special,
+ disallowed_special=disallowed_special,
+ )
+ ]
+
+ def _decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ errors: Optional[str] = None,
+ **kwargs,
+ ) -> str:
+ if isinstance(token_ids, int):
+ token_ids = [token_ids]
+
+ return self.tokenizer.decode(
+ token_ids,
+ errors=errors or self.errors,
+ )
+
+ TokenizerWithoutImagePad.__name__ = \
+ f"{tokenizer.__class__.__name__}WithoutImagePad"
+
+ new_tokenizer.__class__ = TokenizerWithoutImagePad
+ return new_tokenizer
+
+
+class QWenVLProcessor:
+ """
+ This model doesn't define its own HF processor,
+ so we implement our own here.
+
+ We call the wrapped tokenizer to automatically insert image pad tokens:
+ https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245
+
+ The image processor is defined here:
+ https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354
"""
- hf_config = ctx.get_hf_config()
-
- # The presence of a visual config indicates this is a multimodal model.
- # If we don't have it, the model is considered an LLM for warmup purposes.
- if not hasattr(hf_config, "visual"):
- seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
- mm_data = None
- return DummyData(seq_data, mm_data)
-
- # We have a visual component - use images to warm up
- num_images = mm_counts["image"]
- model_config = ctx.model_config
- tokenizer = cached_get_tokenizer(
- model_config.tokenizer,
- trust_remote_code=model_config.trust_remote_code)
-
- # Build the image prompts with no imgpads; the tokenizer will add img pads
- image_prompt = ''.join(
- [get_image_text(idx, False) for idx in range(1, num_images + 1)])
- toks = tokenizer.encode(image_prompt, add_special_tokens=False)
-
- # Make sure we actually get the fixed context size per tok padding
- num_pads = toks.count(tokenizer.encode(IMG_PAD)[0])
- if num_pads != (num_images * MAX_QWEN_IMG_TOKENS):
- raise ValueError(
- f"Tokenized dummy data should encode {MAX_QWEN_IMG_TOKENS} pads"
- f" per image, but got {num_pads} pads for {num_images} image(s)"
- " in total. Are you using a qwen tokenizer?")
-
- # Ensure the number of tokens is at minimum the sequence length provided
- if len(toks) < seq_len:
- toks += [0] * (seq_len - len(toks))
-
- seq_data = SequenceData.from_seqs(toks)
-
- # Build the input images; width/height doesn't actually matter here since
- # the data will get resized and the # of tokens per image is constant
- image = Image.new("RGB", (224, 224), color=0)
- mm_data = {"image": image if num_images == 1 else [image] * num_images}
- return DummyData(seq_data, mm_data)
+
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ tokenizer: PreTrainedTokenizer,
+ ) -> None:
+ super().__init__()
+
+ self.config = config
+ self.tokenizer = tokenizer
+
+ if hasattr(self.config, "visual"):
+ self.image_transform = build_normalization_transform(
+ config.visual["image_size"])
+ else:
+ self.image_transform = None
+
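+ # IMG_START and IMG_END are registered as special tokens by the Qwen-VL
+ # tokenizer, so their IDs can be looked up directly.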
+ special_tokens: dict[str,
+ int] = tokenizer.special_tokens # type: ignore
+ self.img_start_id = special_tokens[IMG_START]
+ self.img_end_id = special_tokens[IMG_END]
+
+ def __call__(
+ self,
+ text: Optional[Union[TextInput, list[TextInput]]] = None,
+ images: Optional[Union[ImageInput, list[ImageInput]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ ) -> BatchFeature:
+ if text is None:
+ text = []
+ if not isinstance(text, list):
+ text = [text]
+ if images is None:
+ images = []
+ if not isinstance(images, list):
+ images = [images]
+
+ text_inputs = self.tokenizer(text)
+
+ if len(images) == 0:
+ image_inputs = {}
+ else:
+ if self.image_transform is None:
+ raise ValueError("This model does not support image inputs")
+
+ pixel_values = [self.image_transform(image) for image in images]
+ image_inputs = {"pixel_values": torch.stack(pixel_values)}
+
+ return BatchFeature(
+ {
+ **text_inputs,
+ **image_inputs,
+ },
+ tensor_type=return_tensors,
+ )
+
+
+class QWenVLProcessingInfo(BaseProcessingInfo):
+
+ def get_tokenizer(self) -> PreTrainedTokenizer:
+ tokenizer = self.ctx.tokenizer
+ assert isinstance(tokenizer, PreTrainedTokenizer)
+
+ return _get_tokenizer_without_image_pad(tokenizer)
+
+ def get_hf_processor(self) -> QWenVLProcessor:
+ tokenizer = self.ctx.tokenizer
+ assert isinstance(tokenizer, PreTrainedTokenizer)
+
+ return QWenVLProcessor(self.get_hf_config(), tokenizer)
+
+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
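+ # A limit of None means any number of images may appear in a prompt.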
+ return {"image": None}
+
+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+ return {"image": self.get_num_image_tokens()}
+
+ def get_num_image_tokens(self) -> int:
+ return MAX_QWEN_IMG_TOKENS
+
+
+class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):
+
+ def get_dummy_processor_inputs(
+ self,
+ seq_len: int,
+ mm_counts: Mapping[str, int],
+ ) -> ProcessorInputs:
+ hf_config = self.info.get_hf_config()
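+ # Text-only Qwen checkpoints have no "visual" config, so profile them
+ # with an empty prompt and no multimodal data.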
+ if not hasattr(hf_config, "visual"):
+ return ProcessorInputs(prompt_text="", mm_data={})
+
+ vision_config = hf_config.visual
+
+ max_image_size = vision_config["image_size"]
+ num_images = mm_counts.get("image", 0)
+
+ mm_data = {
+ "image":
+ self._get_dummy_images(width=max_image_size,
+ height=max_image_size,
+ num_images=num_images)
+ }
+
+ return ProcessorInputs(
+ prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n"
+ for i in range(1, num_images + 1)),
+ mm_data=mm_data,
+ )
+
+
+class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]):
+
+ def _call_hf_processor(
+ self,
+ prompt: str,
+ mm_data: Mapping[str, object],
+ mm_kwargs: Mapping[str, object],
+ ) -> BatchFeature:
+ # Drops anything between <img>/</img> tags; encoding with the tokenizer
+ # will automatically add the image pads for the context.
+ prompt, num_matched_images = re.subn(
+ r"(Picture \d*:
).*?(<\/img>\n)",
+ r"\1\2",
+ prompt,
+ )
+
+ image_data = mm_data.get("images")
+ if image_data is not None:
+ assert isinstance(image_data, list)
+
+ num_images = len(image_data)
+ if num_matched_images != num_images:
+ logger.warning(
+ "Number of matched image placeholders %s doesn't match "
+ "the number of expected images %s; check your placeholder "
+ "formatting.", num_matched_images, num_images)
+
+ return super()._call_hf_processor(
+ prompt=prompt,
+ mm_data=mm_data,
+ mm_kwargs=mm_kwargs,
+ )
+
+ def _get_mm_fields_config(
+ self,
+ hf_inputs: BatchFeature,
+ hf_processor_mm_kwargs: Mapping[str, object],
+ ) -> Mapping[str, MultiModalFieldConfig]:
+ return dict(
+ pixel_values=MultiModalFieldConfig.batched("image"),
+ image_embeds=MultiModalFieldConfig.batched("image"),
+ )
+
+ def _get_prompt_replacements(
+ self,
+ mm_items: MultiModalDataItems,
+ hf_processor_mm_kwargs: Mapping[str, object],
+ out_mm_kwargs: MultiModalKwargs,
+ ) -> list[PromptReplacement]:
+ tokenizer = self.info.get_tokenizer()
+ special_tokens: dict[str,
+ int] = tokenizer.special_tokens # type: ignore
+
+ img_start_id = special_tokens[IMG_START]
+ img_end_id = special_tokens[IMG_END]
+ img_pad_id = special_tokens[IMG_PAD]
+
+ num_image_tokens = self.info.get_num_image_tokens()
+ image_tokens = [img_pad_id] * num_image_tokens
+
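+ # Each <img></img> pair produced by QWenVLProcessor is expanded into
+ # <img> + MAX_QWEN_IMG_TOKENS * <imgpad> + </img>; the pad tokens are
+ # the positions that receive the image features.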
+ return [
+ PromptReplacement(
+ modality="image",
+ target=[img_start_id, img_end_id],
+ replacement=PromptReplacementDetails(
+ full=[img_start_id] + image_tokens + [img_end_id],
+ features=image_tokens,
+ ),
+ )
+ ]
class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA):
@@ -898,38 +909,77 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.make_empty_intermediate_tensors = (
self.transformer.make_empty_intermediate_tensors)
- def _get_image_input_type(
- self,
- pixel_values: Optional[torch.Tensor]) -> Optional[QwenImageInputs]:
- """Determines if the provided pixel_values are normalized pixel values
- or image embeddings.
+ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+ h = w = self.config.visual["image_size"]
+ expected_dims = (3, h, w)
+ actual_dims = tuple(data.shape[1:])
- Args:
- pixel_values: Optional data to processed into visual embeddings.
+ if actual_dims != expected_dims:
+ expected_expr = ("batch_size", *map(str, expected_dims))
+ raise ValueError(
+ f"The expected shape of pixel values is {expected_expr}. "
+ f"You supplied {tuple(data.shape)}.")
+
+ return data
+
+ def _parse_and_validate_image_input(
+ self, **kwargs: object) -> Optional[QwenImageInputs]:
+ pixel_values = kwargs.pop("pixel_values", None)
+ image_embeds = kwargs.pop("image_embeds", None)
+
+ if pixel_values is not None:
+ if not isinstance(pixel_values, torch.Tensor):
+ raise ValueError("Incorrect type of pixel values. "
+ f"Got type: {type(pixel_values)}")
+
+ return QwenImagePixelInputs(
+ type="pixel_values",
+ data=self._validate_pixel_values(
+ flatten_bn(pixel_values, concat=True)),
+ )
+
+ if image_embeds is not None:
+ if not isinstance(image_embeds, torch.Tensor):
+ raise ValueError("Incorrect type of image embeddings. "
+ f"Got type: {type(image_embeds)}")
+
+ return QwenImageEmbeddingInputs(
+ type="image_embeds",
+ data=flatten_bn(image_embeds),
+ )
- Returns:
- None of the QwenImageInputs type used to determine whether or not
- the visual transformer needs to process the pixel_values.
- """
- if pixel_values is not None and self.transformer.visual is not None:
- pixel_values = flatten_bn(pixel_values)
- if len(pixel_values.shape) == 3 and pixel_values.shape[
- 1] == MAX_QWEN_IMG_TOKENS and pixel_values.shape[
- 2] == self.config.visual["output_dim"]:
- return QwenImageEmbeddingInputs(
- type="image_embeds",
- data=pixel_values,
- )
- else:
- # If we have the wrong shape, assume we still need to process
- return QwenImagePixelInputs(
- type="pixel_values",
- data=pixel_values,
- )
return None
- def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
- return self.transformer.get_input_embeddings(input_ids)
+ def _process_image_input(self,
+ image_input: QwenImageInputs) -> torch.Tensor:
+ if image_input["type"] == "image_embeds":
+ return image_input["data"]
+
+ assert self.transformer.visual is not None
+ return self.transformer.visual(image_input["data"])
+
+ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+ image_input = self._parse_and_validate_image_input(**kwargs)
+ if image_input is None:
+ return None
+
+ vision_embeddings = self._process_image_input(image_input)
+ return vision_embeddings
+
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: Optional[NestedTensors] = None,
+ ) -> torch.Tensor:
+ inputs_embeds = self.transformer.get_input_embeddings(input_ids)
+
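+ # Scatter the vision embeddings into the <imgpad> placeholder positions.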
+ if multimodal_embeddings is not None:
+ assert self.transformer.visual is not None
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids, inputs_embeds, multimodal_embeddings,
+ self.transformer.visual.image_pad_id)
+
+ return inputs_embeds
def forward(
self,
@@ -938,18 +988,23 @@ def forward(
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
- pixel_values: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
+ **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
if intermediate_tensors is not None:
+ inputs_embeds = None
+
+ # NOTE: In v1, inputs_embeds is always generated by the model runner;
+ # this condition is for v0 compatibility.
+ elif inputs_embeds is None:
+ vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+ inputs_embeds = self.get_input_embeddings(input_ids,
+ vision_embeddings)
input_ids = None
- pixel_values = None
- else:
- pixel_values = self._get_image_input_type(pixel_values)
hidden_states = self.transformer(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
- pixel_values, inputs_embeds)
+ inputs_embeds)
return hidden_states
def compute_logits(
@@ -1063,10 +1118,9 @@ def get_mm_mapping(self) -> MultiModelKeys:
tower_model="transformer.visual.transformer")
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen)
+@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor,
+ info=QWenVLProcessingInfo,
+ dummy_inputs=QWenVLDummyInputsBuilder)
class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA):
"""
QWenLMHeadModel is not only applicable to LLM but also to VL, which is not
@@ -1084,7 +1138,7 @@ def __new__(
cls,
vllm_config: VllmConfig,
prefix: str = "",
- ) -> None:
+ ) -> QWenBaseModel:
config = vllm_config.model_config.hf_config
# Initialize VL
if hasattr(config, "visual"):