[VLM] Merged multi-modal processor and V1 support for Qwen-VL (vllm-project#12504)

Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
DarkLight1337 authored and Isotr0py committed Feb 2, 2025
1 parent e78b3d2 commit 99d1b38
Showing 4 changed files with 387 additions and 477 deletions.
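In short: the shared multi-modal processing test drops its hand-maintained `modalities` dict (one entry per model, marking whether multiple items per modality are allowed) and instead derives per-prompt limits from each processor's `get_supported_mm_limits()`. That is what lets the newly merged Qwen-VL processor join the common test simply by adding `Qwen/Qwen-VL-Chat` to the model list. Below is a minimal sketch of the new derivation logic, assuming the `Mapping[str, Optional[int]]` return shape implied by the diff; the standalone `derive_limits` helper and the per-model example values are hypothetical:

```python
from typing import Mapping, Optional


def derive_limits(
        supported_mm_limits: Mapping[str, Optional[int]]) -> dict[str, int]:
    # None means the model imposes no inherent cap, so the test exercises
    # 3 items; a concrete limit (e.g. 1 for single-image models) is used
    # as-is. This mirrors the comprehension in the diff below.
    return {
        modality: 3 if limit is None else limit
        for modality, limit in supported_mm_limits.items()
    }


# Hypothetical examples: a model reporting unbounded image/video counts
# versus a model that accepts exactly one image per prompt.
assert derive_limits({"image": None, "video": None}) == {"image": 3, "video": 3}
assert derive_limits({"image": 1}) == {"image": 1}
```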
docs/source/models/supported_models.md (2 changes: 1 addition & 1 deletion)
@@ -745,7 +745,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
   - ✅︎
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2AudioForConditionalGeneration`
   - Qwen2-Audio
   - T + A<sup>+</sup>
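The flipped cell above marks V1-engine support for the `Qwen/Qwen-VL` family. A quick sketch of opting into the V1 engine with Qwen-VL, assuming the `VLLM_USE_V1` environment flag vLLM used while V1 was still opt-in (the flag name and this usage are assumptions, not taken from this diff):

```python
import os

# Opt into the V1 engine (assumed flag; V1 was opt-in around this release).
os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM

# Qwen-VL ships its tokenizer/processor as custom code in the HF repo,
# hence trust_remote_code.
llm = LLM(model="Qwen/Qwen-VL-Chat", trust_remote_code=True)
```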
tests/models/multimodal/processing/test_common.py (64 changes: 32 additions & 32 deletions)
@@ -16,7 +16,6 @@
 
 def _test_processing_correctness(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -25,11 +24,6 @@ def _test_processing_correctness(
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    limit_mm_per_prompt = {
-        modality: 3 if supports_multi else 1
-        for modality, supports_multi in modalities.items()
-    }
-
     model_config = ModelConfig(
         model_id,
         task="auto",
@@ -40,18 +34,29 @@ def _test_processing_correctness(
         dtype="float16",
         revision=None,
         hf_overrides=model_info.hf_overrides,
-        limit_mm_per_prompt=limit_mm_per_prompt,
     )
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
     ctx = InputProcessingContext(
         model_config,
-        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            model_config.tokenizer,
+            trust_remote_code=model_info.trust_remote_code,
+        ),
     )
     # Ensure that it can fit all of the data
     cache = ProcessingCache(capacity=1 << 30)
 
+    processing_info = factories.info(ctx)
+    supported_mm_limits = processing_info.get_supported_mm_limits()
+    limit_mm_per_prompt = {
+        modality: 3 if limit is None else limit
+        for modality, limit in supported_mm_limits.items()
+    }
+
+    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+
     baseline_processor = factories.build_processor(ctx, cache=None)
     cached_processor = factories.build_processor(ctx, cache=cache)
     dummy_inputs = baseline_processor.dummy_inputs
@@ -82,8 +87,8 @@ def _test_processing_correctness(
         mm_data = {
             k:
             [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
-            for k in modalities
+             for _ in range(rng.randint(limit))]
+            for k, limit in limit_mm_per_prompt.items()
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
@@ -135,53 +140,49 @@ def _test_processing_correctness(
 
 # yapf: disable
-# True if the model supports multiple data items of the modality per request
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("rhymes-ai/Aria", {"image": True}),
-    ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": False}),
-    ("deepseek-ai/deepseek-vl2-tiny", {"image": True}),
-    ("adept/fuyu-8b", {"image": False}),
-    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
-    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
-    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
-    ("mistral-community/pixtral-12b", {"image": True}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
-    ("fixie-ai/ultravox-v0_3", {"audio": True}),
+@pytest.mark.parametrize("model_id", [
+    "rhymes-ai/Aria",
+    "Salesforce/blip2-opt-2.7b",
+    "facebook/chameleon-7b",
+    "deepseek-ai/deepseek-vl2-tiny",
+    "adept/fuyu-8b",
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "mistral-community/pixtral-12b",
+    "Qwen/Qwen-VL-Chat",
+    "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2-Audio-7B-Instruct",
+    "fixie-ai/ultravox-v0_3",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_correctness(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
 ):
     _test_processing_correctness(
         model_id,
-        modalities,
         hit_rate=hit_rate,
         num_batches=num_batches,
         simplify_rate=simplify_rate,
     )
 
 
 # yapf: disable
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
-])
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_correctness_phi3v(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -195,7 +196,6 @@ def test_processing_correctness_phi3v(

     _test_processing_correctness(
         model_id,
-        modalities,
         hit_rate=hit_rate,
         num_batches=num_batches,
         simplify_rate=simplify_rate,
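One detail worth noting from the hunks above: `cached_get_tokenizer` now forwards `trust_remote_code`, which the new `Qwen/Qwen-VL-Chat` entry needs because its tokenizer is implemented by custom code shipped in the HF repo rather than a built-in transformers class. A standalone sketch of the same loading requirement, using transformers directly instead of vLLM's wrapper:

```python
from transformers import AutoTokenizer

# Without trust_remote_code=True this load fails, since Qwen-VL's
# tokenizer class is defined by Python code inside the model repository.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen-VL-Chat",
    trust_remote_code=True,
)
```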
tests/models/multimodal/processing/test_qwen.py (144 changes: 0 additions & 144 deletions)

This file was deleted. Its dedicated Qwen-VL processing tests are superseded by the `Qwen/Qwen-VL-Chat` entry added to test_common.py above.
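For context on why the shared test suffices here: the merged processor exposes its modality limits through the common processing-info interface that the test queries. A minimal sketch of that contract, assuming the `Mapping[str, Optional[int]]` shape used by the test; the class name and body are hypothetical stand-ins, not vLLM source:

```python
from typing import Mapping, Optional


class QwenVLProcessingInfo:  # hypothetical stand-in for the merged class
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        # No inherent cap on images per prompt, which the common test
        # maps to exercising up to 3 images (see the sketch near the top).
        return {"image": None}
```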

(The diff for the fourth changed file did not load and is not shown.)
