|
19 | 19 | correctness for the target model outputs.
|
20 | 20 | """
|
21 | 21 |
|
| 22 | +from unittest.mock import patch |
| 23 | + |
22 | 24 | import pytest
|
23 | 25 |
|
| 26 | +from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size |
| 27 | + |
24 | 28 | from .conftest import (run_equality_correctness_test,
|
25 | 29 | run_greedy_equality_correctness_test)
|
26 | 30 |
|
@@ -178,6 +182,62 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
178 | 182 | force_output_len=True)
|
179 | 183 |
|
180 | 184 |
|
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": SPEC_MODEL,
    },
])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use small output len for fast test.
        128,
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
                                                 test_llm_generator,
                                                 batch_size: int,
                                                 output_len: int):
    """Verify greedy equality when the vocab dimension is padded.

    The vocab-embedding padding helper is monkeypatched so that the test
    model's vocab (32000) is padded to 32064 instead of the value the
    caller would normally request, and spec-decode output must still
    match the baseline under greedy sampling.
    """

    # Default pad_to is 64; the test model has a vocab_size of 32000.
    def _force_padded_vocab_size(vocab_size, pad_to=None):
        # Deliberately ignore the caller-supplied pad_to and force 32064.
        return pad_vocab_size(vocab_size, pad_to=32064)

    patch_target = (
        "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size")
    with patch(patch_target, _force_padded_vocab_size):
        run_greedy_equality_correctness_test(baseline_llm_generator,
                                             test_llm_generator,
                                             batch_size,
                                             max_output_len=output_len,
                                             force_output_len=True)
181 | 241 | @pytest.mark.parametrize(
|
182 | 242 | "common_llm_kwargs",
|
183 | 243 | [{
|
|
0 commit comments