[VLM] merged multimodal processor and V1 support for idefics3 (#12660)
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Isotr0py and DarkLight1337 authored Feb 4, 2025
1 parent 18a88fc commit 815079d
Showing 7 changed files with 315 additions and 457 deletions.
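For context, this commit moves Idefics3 onto vLLM's merged multimodal processor and marks it as supported on the V1 engine. The sketch below is not part of the diff; it shows roughly how such a model is queried through the offline `LLM` API, reusing the chat template from the updated tests (`<|begin_of_text|>User:<image>...<end_of_utterance>\nAssistant:`). The image path, question, and sampling settings are illustrative assumptions.

```python
# Hedged sketch (not part of this commit): single-image inference with an
# Idefics3-family model through vLLM's offline LLM API.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="HuggingFaceTB/SmolVLM-256M-Instruct",  # small Idefics3-style model used by the updated tests
    max_model_len=8192,
    limit_mm_per_prompt={"image": 1},
)

image = Image.open("example.jpg")  # any RGB image
prompt = ("<|begin_of_text|>User:<image>What is shown in this image?"
          "<end_of_utterance>\nAssistant:")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64, stop=["<end_of_utterance>"]),
)
print(outputs[0].outputs[0].text)
```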
2 changes: 1 addition & 1 deletion docs/source/models/supported_models.md
@@ -733,7 +733,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * `HuggingFaceM4/Idefics3-8B-Llama3` etc.
  * ✅︎
  *
  *
  * ✅︎
- * `InternVLChatModel`
  * InternVL 2.5, Mono-InternVL, InternVL 2.0
  * T + I<sup>E+</sup>
4 changes: 2 additions & 2 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -254,14 +254,14 @@
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
    ),
    "idefics3": VLMTestInfo(
        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        marks=[large_gpu_mark(min_gb=48)],
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
        models=[
@@ -192,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
    return output_ids, output_str, out_logprobs


def idefics3_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<end_of_utterance>"):
        output_str = output_str.split("<end_of_utterance>")[0]
    return output_ids, output_str, out_logprobs


def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -149,6 +149,7 @@ def _test_processing_correctness(
"adept/fuyu-8b",
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
179 changes: 33 additions & 146 deletions tests/models/multimodal/processing/test_idefics3.py
@@ -1,177 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for Idefics3's multimodal preprocessing kwargs."""
from typing import Optional

import pytest
import torch
from transformers import AutoImageProcessor, AutoTokenizer
from transformers import Idefics3Config

from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer

from ....conftest import _ImageAssets
from ...utils import build_model_context

models = ["HuggingFaceM4/Idefics3-8B-Llama3"]


# Wrap lazy imports to avoid initializing CUDA during test collection
@pytest.fixture()
def input_processor_for_idefics3():
    from vllm.model_executor.models.idefics3 import (
        input_processor_for_idefics3)
    return input_processor_for_idefics3


@pytest.fixture()
def dummy_data_for_idefics3():
    from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
    return dummy_data_for_idefics3


@pytest.fixture()
def get_max_idefics3_image_tokens():
    from vllm.model_executor.models.idefics3 import (
        get_max_idefics3_image_tokens)
    return get_max_idefics3_image_tokens


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
                               longest_edge: Optional[int]):
    """Ensure that the [default] input mapper handles size properly."""

    mm_processor_kwargs = {
        "size": {
            "longest_edge": longest_edge
        }
    } if longest_edge is not None else {}
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=mm_processor_kwargs,
    )

    hf_processor = AutoImageProcessor.from_pretrained(model,
                                                      trust_remote_code=True,
                                                      **mm_processor_kwargs)

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)

    image = image_assets[0].pil_image
    hf_result = hf_processor.preprocess(
        image,
        return_tensors="pt",
    )

    vllm_result = mm_registry.map_input(
        ctx.model_config,
        {"image": image},
    )

    assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
    (None, 2873),
    (168, 169),
    (336, 169),
    (400, 338),
    (672, 338),
])
def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
                             longest_edge: Optional[int],
                             expected_max_tokens: int):
    """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
    size = {"longest_edge": longest_edge} if longest_edge is not None else None
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )

    actual_max_tokens = get_max_idefics3_image_tokens(
        ctx=InputContext(ctx.model_config),
        size=size,
    )

    assert expected_max_tokens == actual_max_tokens


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
    (168, 169, 1),
    (168, 169, 2),
    (400, 338, 1),
    (400, 338, 2),
])
def test_dummy_data_override(dummy_data_for_idefics3, model: str,
                             longest_edge: int, toks_per_img: int,
                             num_imgs: int):
    """Ensure dummy_data_for_idefics3 handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
    # the partial when calling the dummy data func.
    size = {"longest_edge": longest_edge} if longest_edge is not None else None
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )

    dummy_data = dummy_data_for_idefics3(
        ctx=ctx,
        seq_len=8192, # Should be bigger than num_imgs * toks_per_img
        mm_counts={"image": num_imgs},
        size=size)
    sequence_data = dummy_data.seq_data
    # Ensure we have the right number of placeholders per size
    image_token_id = ctx.get_hf_config().image_token_id
    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
    assert img_tok_count == toks_per_img * num_imgs


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
    (336, 169 * (1**2 + 1), 1),
    (336, 169 * (1**2 + 1), 2),
    (400, 169 * (2**2 + 1), 1),
    (400, 169 * (2**2 + 1), 2),
])
def test_input_processor_override(input_processor_for_idefics3,
                                  image_assets: _ImageAssets, model: str,
                                  longest_edge: int,
                                  expected_toks_per_img: int, num_imgs: int):
# yapf: disable
@pytest.mark.parametrize(
    ("mm_processor_kwargs", "expected_toks_per_img"),
    [
        ({"size": {"longest_edge": 364}}, 169),
        ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
    ])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(image_assets: _ImageAssets, model: str,
                            mm_processor_kwargs: dict[str, object],
                            expected_toks_per_img: int, num_imgs: int):
    """Ensure input_processor_for_idefics3 handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
    # the partial when calling the custom input processor.
    size = {"longest_edge": longest_edge} if longest_edge is not None else None
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)

    # Build the image str / prompt based on the number of images we pass
    tokenizer = AutoTokenizer.from_pretrained(model)
    placeholders = "<image>" if num_imgs == 1 else "\n".join(
        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
    prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
    images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs

    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
                          prompt=prompt,
                          multi_modal_data={"image": images})

    processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
    # Build mm_data
    image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
    dummy_image_size = (image_size * 4, image_size * 4)
    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
    mm_data = {"image": [dummy_image] * num_imgs}

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
    # Ensure the placeholders format are correct
    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
        "input_ids"][0]

    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = ctx.get_hf_config().image_token_id
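The expected token counts in the new parametrization follow the Idefics3 tiling scheme implied by the test values: every 364-pixel crop contributes 169 placeholder tokens, and a tiled image additionally gets one global thumbnail. A hedged back-of-the-envelope check, assuming square images and exact multiples (the real processor also handles rounding and non-square inputs):

```python
def expected_image_tokens(longest_edge: int, crop_size: int = 364,
                          tokens_per_crop: int = 169) -> int:
    """Rough per-image token count for an Idefics3-style tiling processor."""
    grid = longest_edge // crop_size  # crops per axis after resizing
    if grid <= 1:
        return tokens_per_crop                # single global image only
    return tokens_per_crop * (grid**2 + 1)    # N x N crops + 1 global image


assert expected_image_tokens(364) == 169
assert expected_image_tokens(728) == 169 * (2**2 + 1)  # 845
```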
18 changes: 18 additions & 0 deletions vllm/inputs/registry.py
@@ -31,6 +31,17 @@
P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin)


class HashableDict(dict):
    """
    A dictionary that can be hashed by lru_cache.
    """

    # NOTE: pythonic dict is not hashable,
    # we override on it directly for simplicity
    def __hash__(self) -> int: # type: ignore[override]
        return hash(frozenset(self.items()))


@dataclass(frozen=True)
class InputContext:
    """
@@ -104,6 +115,13 @@ def get_hf_processor(
        if isinstance(typ, type):
            merged_kwargs["processor_cls"] = typ

        # NOTE: Pythonic dict is not hashable and will raise unhashable type
        # error when calling `cached_get_processor`, therefore we need to
        # wrap it to a hashable dict.
        for key, value in merged_kwargs.items():
            if isinstance(value, dict):
                merged_kwargs[key] = HashableDict(value)

        hf_processor = cached_get_processor(
            self.model_config.model,
            trust_remote_code=self.model_config.trust_remote_code,
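The `HashableDict` shim exists because `cached_get_processor` caches on its arguments (per the docstring above, via `lru_cache`), and `lru_cache` hashes every argument: passing a plain `dict` such as `size={"longest_edge": 364}` would raise `TypeError: unhashable type: 'dict'`. A small self-contained sketch of the idea; `load_processor` is a hypothetical stand-in, not vLLM's API:

```python
from functools import lru_cache


class HashableDict(dict):
    """A dict usable as an argument to lru_cache-wrapped functions."""

    def __hash__(self) -> int:  # type: ignore[override]
        return hash(frozenset(self.items()))


@lru_cache
def load_processor(model: str, size) -> str:
    # Hypothetical stand-in for cached_get_processor; just echoes its inputs.
    return f"{model} with size={dict(size)}"


# A plain dict argument would raise "unhashable type: 'dict'":
# load_processor("idefics3", {"longest_edge": 364})
print(load_processor("idefics3", HashableDict(longest_edge=364)))
```

Note that `frozenset(self.items())` requires the values themselves to be hashable, so nested plain dicts would still need wrapping.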