diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 21deec2bba973..45bc8eb2f8477 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest Pillow protobuf - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 633bc5ca95bf9..be8807df0b098 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -140,14 +140,13 @@ steps: working_dir: "/vllm-workspace/examples" mirror_hardwares: [amd] commands: - # install aws cli for llava_example.py # install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - - python3 llava_example.py + - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - label: Inputs Test @@ -220,7 +219,6 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - soft_fail: true fast_check: true commands: - apt-get install -y curl libsodium23 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 428e199088589..f1959ad2743f3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,7 @@ build: sphinx: configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 7b4c0166a04bd..64bc0f3c12c75 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ + torch==2.5.0.dev20240726 \ + torchvision==0.20.0.dev20240726 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 78cac8a555d1b..a04433142da42 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 
78736c7a7ba6f..f542684a9a2a9 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 350dbce1d7ba9..875570a1e894f 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -751,6 +751,9 @@ void paged_attention_v1_launcher( case 112: LAUNCH_PAGED_ATTENTION_V1(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; @@ -912,6 +915,9 @@ void paged_attention_v2_launcher( case 112: LAUNCH_PAGED_ATTENTION_V2(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; diff --git a/docs/source/conf.py b/docs/source/conf.py index f4cec05663fcd..b867bfd89dc17 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,6 +94,7 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ + "aiohttp", "cpuinfo", "torch", "transformers", @@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: } autodoc_preserve_defaults = True +autodoc_warningiserror = True navigation_with_keys = False diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 1c7d274b7c47e..9648d07d2790c 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. $ # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 $ # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index 62bf779c339d5..0816524468cab 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able $ git clone https://github.com/vllm-project/vllm.git $ cd vllm $ pip install -U -r requirements-neuron.txt - $ pip install . + $ VLLM_TARGET_DEVICE="neuron" pip install . If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 483f552bba238..83c1b9c8bce86 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo The following is the list of model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. 
+---- + Decoder-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: @@ -186,6 +188,10 @@ Vision Language Models - Models - Example HuggingFace Models - :ref:`LoRA ` + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - :code:`facebook/chameleon-7b` etc. @@ -215,6 +221,8 @@ Vision Language Models - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. - +---- + If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement support for your model. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index ef4ce0d44a162..a385605c9f8f6 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI generated_text = o.outputs[0].text print(generated_text) -A code example can be found in `examples/llava_example.py `_. +A code example can be found in `examples/offline_inference_vision_language.py `_. Online OpenAI Vision API Compatible Inference diff --git a/examples/api_client.py b/examples/api_client.py index 27a2a08b7b0c3..49a085febdc57 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -31,7 +31,10 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, headers=headers, json=pload, stream=True) + response = requests.post(api_url, + headers=headers, + json=pload, + stream=stream) return response diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py deleted file mode 100644 index c92b8fb4bc286..0000000000000 --- a/examples/fuyu_example.py +++ /dev/null @@ -1,31 +0,0 @@ -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_fuyu(): - llm = LLM(model="adept/fuyu-8b", max_model_len=4096) - - # single-image prompt - prompt = "What is the highest life expectancy at of male?\n" - url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png" - image = Image.open(requests.get(url, stream=True).raw) - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_fuyu() diff --git a/examples/llava_example.py b/examples/llava_example.py deleted file mode 100644 index 4c9eabd261e5c..0000000000000 --- a/examples/llava_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_llava(): - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_llava() diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py deleted file mode 100644 index fd53a6def1a13..0000000000000 --- a/examples/llava_next_example.py +++ /dev/null @@ -1,36 +0,0 @@ -from io import BytesIO - -import 
requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_llava_next(): - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096) - - prompt = "[INST] \nWhat is shown in this image? [/INST]" - url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" - image = Image.open(BytesIO(requests.get(url).content)) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=100) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - } - }, - sampling_params=sampling_params) - - generated_text = "" - for o in outputs: - generated_text += o.outputs[0].text - - print(f"LLM output:{generated_text}") - - -if __name__ == "__main__": - run_llava_next() diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py deleted file mode 100644 index bf20a7ea04ad4..0000000000000 --- a/examples/minicpmv_example.py +++ /dev/null @@ -1,55 +0,0 @@ -from transformers import AutoTokenizer - -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - -# 2.0 -# The official repo doesn't work yet, so we need to use a fork for now -# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -# MODEL_NAME = "HwwwH/MiniCPM-V-2" -# 2.5 -MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" - -image = ImageAsset("stop_sign").pil_image.convert("RGB") - -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) -llm = LLM(model=MODEL_NAME, - gpu_memory_utilization=1, - trust_remote_code=True, - max_model_len=4096) - -messages = [{ - 'role': - 'user', - 'content': - '(./)\n' + "What's the content of the image?" -}] -prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) -# 2.0 -# stop_token_ids = [tokenizer.eos_id] -# 2.5 -stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - -sampling_params = SamplingParams( - stop_token_ids=stop_token_ids, - # temperature=0.7, - # top_p=0.8, - # top_k=100, - # seed=3472, - max_tokens=1024, - # min_tokens=150, - temperature=0, - use_beam_search=True, - # length_penalty=1.2, - best_of=3) - -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - } -}, - sampling_params=sampling_params) -print(outputs[0].outputs[0].text) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py new file mode 100644 index 0000000000000..04ba1a96314c9 --- /dev/null +++ b/examples/offline_inference_vision_language.py @@ -0,0 +1,184 @@ +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.utils import FlexibleArgumentParser + +# Input image and question +image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +question = "What is the content of this image?" 
+ + +# LLaVA-1.5 +def run_llava(question): + + prompt = f"USER: \n{question}\nASSISTANT:" + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + return llm, prompt + + +# LLaVA-1.6/LLaVA-NeXT +def run_llava_next(question): + + prompt = f"[INST] \n{question} [/INST]" + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") + + return llm, prompt + + +# Fuyu +def run_fuyu(question): + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b") + + return llm, prompt + + +# Phi-3-Vision +def run_phi3v(question): + + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 + # Note: The default setting of max_num_seqs (256) and + # max_model_len (128k) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # In this example, we override max_num_seqs to 5 while + # keeping the original context length of 128k. + llm = LLM( + model="microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + +# PaliGemma +def run_paligemma(question): + + prompt = question + llm = LLM(model="google/paligemma-3b-mix-224") + + return llm, prompt + + +# Chameleon +def run_chameleon(question): + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b") + return llm, prompt + + +# MiniCPM-V +def run_minicpmv(question): + + # 2.0 + # The official repo doesn't work yet, so we need to use a fork for now + # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa + # model_name = "HwwwH/MiniCPM-V-2" + + # 2.5 + model_name = "openbmb/MiniCPM-Llama3-V-2_5" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM( + model=model_name, + trust_remote_code=True, + ) + + messages = [{ + 'role': 'user', + 'content': f'(./)\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + return llm, prompt + + +# BLIP-2 +def run_blip2(question): + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. + # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b") + return llm, prompt + + +model_example_map = { + "llava": run_llava, + "llava-next": run_llava_next, + "fuyu": run_fuyu, + "phi3_v": run_phi3v, + "paligemma": run_paligemma, + "chameleon": run_chameleon, + "minicpmv": run_minicpmv, + "blip-2": run_blip2, +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + llm, prompt = model_example_map[model](question) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + + args = parser.parse_args() + main(args) diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py deleted file mode 100644 index 92a3cb3ac4129..0000000000000 --- a/examples/paligemma_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_paligemma(): - llm = LLM(model="google/paligemma-3b-mix-224") - - prompt = "caption es" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_paligemma() diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py deleted file mode 100644 index ae8c38d84e8fd..0000000000000 --- a/examples/phi3v_example.py +++ /dev/null @@ -1,40 +0,0 @@ -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - - -def run_phi3v(): - model_path = "microsoft/Phi-3-vision-128k-instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (128k) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # In this example, we override max_num_seqs to 5 while - # keeping the original context length of 128k. 
- llm = LLM( - model=model_path, - trust_remote_code=True, - max_num_seqs=5, - ) - - image = ImageAsset("cherry_blossom").pil_image - - # single-image prompt - prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_phi3v() diff --git a/examples/template_blip2.jinja b/examples/template_blip2.jinja new file mode 100644 index 0000000000000..fd41a7f7fa666 --- /dev/null +++ b/examples/template_blip2.jinja @@ -0,0 +1,11 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- 'Question: ' + message['content'] + ' ' -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Answer: ' + message['content'] + ' ' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- 'Answer:' -}} +{% endif %} diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 22487f5524dd7..c2140fbffec9f 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -4,4 +4,5 @@ # Dependencies for TPU # Currently, the TPU backend uses a nightly version of PyTorch XLA. # You can install the dependencies in Dockerfile.tpu. +ray triton # To avoid import errors diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 2e6412c28958e..c7c6707461c3e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] @@ -134,6 +134,8 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f9a609464abfc..3fb9b59be1701 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -52,6 +52,8 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -124,6 +126,8 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -325,6 +329,8 @@ def test_swap_blocks( ) -> None: if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_pos_encoding.py 
b/tests/kernels/test_pos_encoding.py index 4c83659929d41..4a7ad6e0fa21d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py new file mode 100644 index 0000000000000..26afd57ae6106 --- /dev/null +++ b/tests/models/test_blip2.py @@ -0,0 +1,102 @@ +from typing import List, Optional, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "Question: What's the content of the image? Answer:", + "cherry_blossom": + "Question: What is the season? Answer:", +}) + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + _, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "\n" + + tokenizer = AutoTokenizer.from_pretrained(model) + hf_output_ids = tokenizer.encode(hf_output_str) + assert hf_output_ids[0] == tokenizer.bos_token_id + hf_output_ids = hf_output_ids[1:] + + return hf_output_ids, hf_output_str, out_logprobs + + +@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) diff --git a/tests/models/test_danube3_4b.py b/tests/models/test_danube3_4b.py new file mode 100644 index 0000000000000..bfaa275f73c19 --- /dev/null +++ b/tests/models/test_danube3_4b.py @@ -0,0 +1,52 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +This tests danube3 separately because its head size isn't supported on CPU yet. + +Run `pytest tests/models/test_danube3_4b.py`. +""" +import pytest + +from .utils import check_outputs_equal + +MODELS = ["h2oai/h2o-danube3-4b-base"] + +target_dtype = "half" + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 25f63a3d64d0e..7d0f3be5ea008 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -77,8 +77,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model: @@ -89,9 +89,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 9124fa7a6238c..c57f0f8c08548 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -88,9 +88,9 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images, + images=images, stop_token_ids=stop_token_ids) - for prompts, vllm_images in inputs_per_image + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): @@ -114,9 +114,9 @@ def to(self, device: torch.types.Device): hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, tokenizer=tokenizer) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 9da25ab8d78fe..35ffe4ef50a85 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -101,8 +101,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] # use eager mode for hf runner, since phi3_v didn't work with flash_attn @@ -114,9 +114,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py new file mode 100644 index 0000000000000..c5c6fc1057d31 --- /dev/null +++ b/tests/tensorizer_loader/conftest.py @@ -0,0 +1,45 @@ +# isort: skip_file + +import contextlib +import gc + +import pytest +import ray +import torch + +from vllm.distributed import (destroy_distributed_environment, + destroy_model_parallel) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + + +def cleanup(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.cuda.empty_cache() + ray.shutdown() + + +@pytest.fixture() +def should_do_global_cleanup_after_test(request) -> bool: + """Allow subdirectories to skip global cleanup by overriding this fixture. + This can provide a ~10x speedup for non-GPU unit tests since they don't need + to initialize torch. 
+ """ + + return True + + +@pytest.fixture(autouse=True) +def cleanup_fixture(should_do_global_cleanup_after_test: bool): + yield + if should_do_global_cleanup_after_test: + cleanup() + + +@pytest.fixture(autouse=True) +def tensorizer_config(): + config = TensorizerConfig(tensorizer_uri="vllm") + return config \ No newline at end of file diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b7030e3cd6d42..2adeae8874bdb 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -40,7 +40,6 @@ tensorize_model_for_testing_script = os.path.join( os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py") - def is_curl_installed(): try: subprocess.check_call(['curl', '--version']) @@ -63,10 +62,6 @@ def write_keyfile(keyfile_path: str): with open(keyfile_path, 'wb') as f: f.write(encryption_params.key) -@pytest.fixture(autouse=True) -def tensorizer_config(): - config = TensorizerConfig(tensorizer_uri="vllm") - return config @patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') @@ -105,6 +100,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + cleanup() with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -316,6 +312,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): + cleanup() model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b83a83bb177d4..c53a2f91b89d7 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -55,8 +55,8 @@ class PallasMetadata(AttentionMetadata): # Currently, input sequences can only contain all prefills # or all decoding. 
- block_tables: Optional[torch.Tensor] - context_lens: Optional[torch.Tensor] + block_tables: Optional[torch.Tensor] = None + context_lens: Optional[torch.Tensor] = None @property def prefill_metadata(self) -> Optional["PallasMetadata"]: diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index ce7b4d129779c..0f6d2f2d1ab3f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 192, 256] + return [64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd64d3345b830..bad5be4917216 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + has_seqlen_agnostic_layers = ( + model_config.contains_seqlen_agnostic_layers( + parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching): + and not self.enable_prefix_caching + and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 614cf458c470b..1efe2206abe81 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -393,8 +393,14 @@ def _get_executor_cls(cls, from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutor - executor_class = TPUExecutor + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutor + executor_class = RayTPUExecutor + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutor + executor_class = TPUExecutor elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 19f7a497cdd9f..e1e92958e667c 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -6,6 +6,8 @@ from functools import partial from typing import Any, List, Optional +import torch + from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker @@ -45,10 +47,23 @@ def _init_executor(self) -> None: # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU - # contention amongst the shards - if "OMP_NUM_THREADS" not in os.environ: - os.environ["OMP_NUM_THREADS"] = "1" + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. 
The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 if world_size > 1: diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py new file mode 100644 index 0000000000000..7048d47980723 --- /dev/null +++ b/vllm/executor/ray_tpu_executor.py @@ -0,0 +1,313 @@ +import asyncio +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple, + Union) + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.tpu_executor import TPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + get_vllm_instance_id, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayTPUExecutor(TPUExecutor): + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + super().__init__(*args, **kwargs) + + def _init_executor(self) -> None: + assert self.parallel_config.distributed_executor_backend == "ray" + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel TPU workers. + self._init_workers_ray(placement_group) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. 
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("TPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + assert self.speculative_config is None + worker_module_name = "vllm.worker.tpu_worker" + worker_class_name = "TPUWorker" + + worker = ray.remote( + num_cpus=0, + resources={"TPU": 1}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any TPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "TPU node.") + + # Get the set of TPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for _ in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + + if len(node_workers) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. 
+ """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + num_blocks = self._run_workers("determine_num_available_blocks", ) + num_tpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + return num_tpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + **self.extra_execute_model_run_workers_kwargs) + + # Only the driver worker returns the sampling results. 
+ return self._driver_execute_model(execute_model_req) + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + +class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. + return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index ead64c0e92553..fe04c6db5fbc2 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,6 +16,8 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "Blip2ForConditionalGeneration": + ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), @@ -56,8 +58,8 @@ "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), - "PaliGemmaForConditionalGeneration": - ("paligemma", "PaliGemmaForConditionalGeneration"), + "PaliGemmaForConditionalGeneration": ("paligemma", + "PaliGemmaForConditionalGeneration"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py new file mode 100644 index 0000000000000..0b124d5e8a85a --- /dev/null +++ b/vllm/model_executor/models/blip.py @@ -0,0 +1,269 @@ +"""Minimal implementation of BlipVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Union + +import torch +import torch.nn as nn +from PIL import Image +from transformers import Blip2VisionConfig, BlipVisionConfig +from transformers.models.blip.modeling_blip import 
BlipAttention + +from vllm.config import ModelConfig +from vllm.inputs import LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) +from vllm.sequence import SequenceData + + +def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_blip_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_blip_image_feature_size( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_blip_image_tokens( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_image_feature_size(hf_config) + + +def dummy_seq_data_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + *, + image_width_override: Optional[int] = None, + image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def input_processor_for_blip( + model_config: ModelConfig, + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + llm_inputs: LLMInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=image_token_id, + repeat_count=image_feature_size, + ) + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa +class BlipVisionEmbeddings(nn.Module): + + def __init__(self, config: BlipVisionConfig): + super().__init__() + + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + 
self.patch_embedding = nn.Conv2d( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = get_blip_num_patches(image_size=self.image_size, + patch_size=self.patch_size) + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + position_embeds = self.position_embedding.to(target_dtype) + embeddings = embeddings + position_embeds[:, :embeddings.size(1), :] + + return embeddings + + +class BlipMLP(nn.Module): + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.config = config + + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class BlipEncoderLayer(nn.Module): + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = BlipMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`BlipEncoderLayer`]. 
+ + Args: + config: BlipConfig + """ + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + BlipEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class BlipVisionModel(nn.Module): + config_class = BlipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + self.post_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + hidden_states = self.embeddings(pixel_values) + hidden_states = self.encoder(inputs_embeds=hidden_states) + + return self.post_layernorm(hidden_states) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py new file mode 100644 index 0000000000000..e00e6c0806957 --- /dev/null +++ b/vllm/model_executor/models/blip2.py @@ -0,0 +1,669 @@ +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, + apply_chunking_to_forward) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.opt import OPTModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +from .blip import (BlipVisionModel, dummy_image_for_blip, + get_max_blip_image_tokens) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +class Blip2QFormerMultiHeadAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.config = config + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of " + f"the number of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = 
config.num_attention_heads + self.attention_head_size = (config.hidden_size // + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + kv_hidden_size = config.encoder_hidden_size + else: + kv_hidden_size = config.hidden_size + self.key = nn.Linear(kv_hidden_size, self.all_head_size) + self.value = nn.Linear(kv_hidden_size, self.all_head_size) + + self.position_embedding_type = getattr(config, + "position_embedding_type", + "absolute") + if self.position_embedding_type != "absolute": + raise NotImplementedError("Unsupported position_embedding_type: " + f"{self.position_embedding_type}") + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + x = x.view(*x.size()[:-1], self.num_attention_heads, + self.attention_head_size) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ): + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_probs = torch.softmax(attention_scores * self.scaling, + dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + context_layer = context_layer.view(*context_layer.size()[:-2], + self.all_head_size) + + return context_layer + + +class Blip2QFormerSelfOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.attention = Blip2QFormerMultiHeadAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=is_cross_attention, + ) + + self.output = Blip2QFormerSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.Tensor]: + self_output = self.attention( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + attention_output = self.output(self_output, hidden_states) + + return attention_output + + +class Blip2QFormerIntermediate(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_act_fn(config.hidden_act) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class Blip2QFormerOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + layer_idx: int, + ) -> None: + super().__init__() + + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config, + quant_config=quant_config, + cache_config=cache_config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + 
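# A minimal, self-contained sketch of how `cross_attention_frequency` selects
# which Q-Former layers get a cross-attention block over the image features,
# mirroring the `layer_idx % config.cross_attention_frequency == 0` check
# above. The example values (12 layers, frequency 2) are illustrative
# assumptions, not read from any particular checkpoint.
def layers_with_cross_attention(num_hidden_layers: int,
                                cross_attention_frequency: int) -> list:
    return [
        layer_idx for layer_idx in range(num_hidden_layers)
        if layer_idx % cross_attention_frequency == 0
    ]

assert layers_with_cross_attention(12, 2) == [0, 2, 4, 6, 8, 10]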
self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ): + attention_output = self.attention(hidden_states) + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + query_attention_output = self.crossattention( + query_attention_output, + encoder_hidden_states=encoder_hidden_states, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], + dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + + return layer_output + + def feed_forward_chunk(self, + attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query( + self, attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layer = nn.ModuleList([ + Blip2QFormerLayer(config, + quant_config=quant_config, + cache_config=cache_config, + layer_idx=layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ) -> torch.Tensor: + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + + hidden_states = layer_module( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return hidden_states + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025 +class Blip2QFormerModel(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config, + quant_config=quant_config, + cache_config=cache_config) + + def forward( + self, + query_embeds: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + ) -> torch.Tensor: + query_length = query_embeds.shape[1] + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + sequence_output = self.encoder( + embedding_output, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return sequence_output + + +class 
Blip2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +Blip2ImageInputs = Blip2ImagePixelInputs + +# We use this internally as placeholders since there is no image token +# defined on the HuggingFace repo +BLIP2_IMAGE_TOKEN = "" +BLIP2_IMAGE_TOKEN_ID = 50265 + + +def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: + return hf_config.num_query_tokens + + +def get_max_blip2_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + if isinstance(vision_config, Blip2VisionConfig): + return get_max_blip_image_tokens(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_blip2(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + image_feature_size = get_blip2_image_feature_size(hf_config) + token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + seq_data = SequenceData(token_ids) + + if isinstance(vision_config, Blip2VisionConfig): + mm_data = dummy_image_for_blip(vision_config) + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def input_processor_for_blip2(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + hf_config = ctx.get_hf_config(Blip2Config) + image_feature_size = get_blip2_image_feature_size(hf_config) + + # The original model places image tokens at the front + # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 + new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + new_token_ids += llm_inputs["prompt_token_ids"] + + new_prompt = llm_inputs.get("prompt") + if new_prompt is not None: + new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) +@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +class Blip2ForConditionalGeneration(nn.Module, SupportsVision): + + def __init__(self, + config: Blip2Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + # TODO: Optionally initializes this for supporting embeddings. 
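# A small sketch of the placeholder-prepending step performed by
# `input_processor_for_blip2` above: `num_query_tokens` copies of the BLIP-2
# image token id are placed in front of the prompt so that the KV cache
# reserves space for the image embeddings. The prompt ids below are arbitrary,
# and 32 query tokens is an assumption matching the forward() docstring
# further down.
def prepend_image_placeholders(prompt_token_ids: list,
                               num_query_tokens: int,
                               image_token_id: int = 50265) -> list:
    return [image_token_id] * num_query_tokens + list(prompt_token_ids)

_example = prepend_image_placeholders([2, 45641, 35, 31652, 35],
                                      num_query_tokens=32)
assert _example[:32] == [50265] * 32
assert _example[32:] == [2, 45641, 35, 31652, 35]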
+ self.vision_model = BlipVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter( + torch.zeros(1, config.num_query_tokens, + config.qformer_config.hidden_size)) + + self.qformer = Blip2QFormerModel(config.qformer_config, + cache_config=cache_config, + quant_config=quant_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + self.quant_config = quant_config + + self.language_model = OPTModel(config.text_config, cache_config, + quant_config) + + self.unpadded_vocab_size = config.text_config.vocab_size + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size) + self.sampler = Sampler() + + def get_lm_head(self): + return self.language_model.decoder.embed_tokens + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Blip2ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return Blip2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def _image_pixels_to_features(self, vision_model: BlipVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_model(pixel_values) + + return image_features + + def _process_image_pixels(self, + inputs: Blip2ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_model, pixel_values) + + def _process_image_input(self, + image_input: Blip2ImageInputs) -> torch.Tensor: + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, + -1) + query_output = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_features, + ) + + return self.language_projection(query_output) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + """Run forward pass for BLIP-2. + + One key thing to understand is the `input_ids` already accounts for the + positions of the to-be-inserted image embeddings. + + Concretely, consider a text prompt: + `"Question: What's the content of the image? Answer:"`. + + Tokenizer outputs: + `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`. + + To reserve space in KV cache, we have to insert placeholder tokens + before they are inputted to the model, so the input processor prepends + dummy tokens (denoted as `50265`), resulting in: + `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`. 
+ + We insert 32 tokens since it corresponds to the number of query + embeddings outputted by the Q-Former and inputted to the language model. + + This way, the `positions` and `attn_metadata` are consistent + with the `input_ids`. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: The pixels in each input image. + + See also: + :class:`Blip2ImageInputs` + """ + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vision_embeddings, + BLIP2_IMAGE_TOKEN_ID) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.get_lm_head(), hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # only doing this for language model part for now. + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + if "lm_head.weight" in name: + continue + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
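# A simplified, self-contained sketch of the merging step described in the
# forward() docstring above: wherever `input_ids` equals the image token id,
# the corresponding rows of the text embeddings are overwritten with the
# projected Q-Former outputs. This mirrors the intent of
# `merge_vision_embeddings` but is not the vLLM implementation; the shapes and
# hidden size below are assumptions chosen only for illustration.
import torch

def merge_placeholder_embeddings(input_ids: torch.Tensor,
                                 inputs_embeds: torch.Tensor,
                                 vision_embeds: torch.Tensor,
                                 image_token_id: int) -> torch.Tensor:
    mask = input_ids == image_token_id
    # Flatten (num_images, num_query_tokens, hidden) so the rows line up with
    # the flattened placeholder positions.
    inputs_embeds[mask] = vision_embeds.reshape(-1, vision_embeds.shape[-1])
    return inputs_embeds

# Toy example: 32 query-token placeholders followed by 5 text tokens.
_ids = torch.tensor([50265] * 32 + [2, 45641, 35, 31652, 35])
_text = torch.zeros(_ids.shape[0], 8)
_vision = torch.ones(1, 32, 8)
_merged = merge_placeholder_embeddings(_ids, _text, _vision, 50265)
assert _merged[:32].eq(1).all() and _merged[32:].eq(0).all()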
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index d4e4f0055aa2b..3444578227259 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -644,6 +644,11 @@ def __init__( lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, ) -> None: + assert not scheduler_config.chunked_prefill_enabled, \ + "Jamba currently does not support chunked prefill" + assert not cache_config.enable_prefix_caching, \ + "Jamba currently does not support prefix caching" + super().__init__() self.config = config self.scheduler_config = scheduler_config diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index edc16710c0229..a05090cd46648 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -237,14 +237,19 @@ def __init__( for _ in range(config.num_hidden_layers) ]) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) pos_embeds = self.embed_positions(positions) if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) @@ -272,14 +277,22 @@ def __init__( super().__init__() self.decoder = OPTDecoder(config, cache_config, quant_config) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.decoder.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.decoder(input_ids, positions, kv_caches, attn_metadata) + return self.decoder(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) class OPTForCausalLM(nn.Module): diff --git a/vllm/utils.py b/vllm/utils.py index a6d5b8feb6a01..b7589ca50ba5b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -511,6 +511,12 @@ def create_kv_caches_with_random( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 8a8b412db6731..e5bb101fc7df4 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from 
unittest.mock import patch import numpy as np import torch @@ -45,6 +46,7 @@ class ModelInputForTPU(ModelRunnerInputBase): num_samples: int best_of: List[int] seq_groups: List[List[int]] + virtual_engine: int = 0 def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -55,6 +57,9 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, + "best_of": self.best_of, + "seq_groups": self.seq_groups, + "virtual_engine": self.virtual_engine, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -113,16 +118,30 @@ def __init__( def load_model(self) -> None: self.device = self.device_config.device - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - multimodal_config=self.multimodal_config, - lora_config=None, - ) + # NOTE(woosuk): While the executor assigns the TP ranks to the worker + # process, the ranks can be different from the ranks internally assigned + # by the xm runtime. Therefore, there is a mismatch in the rank + # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. + # This is not a problem in linear layers because all-reduce is + # rank-agnostic. However, it matters for all-gather as the ranks + # determine the order of concatenating the output tensors. + # As a workaround, we use the xm's rank assignment only when loading + # the embedding weights. + xm_tp_rank = xm.get_ordinal() + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding." + "get_tensor_model_parallel_rank", + return_value=xm_tp_rank): + model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + parallel_config=self.parallel_config, + cache_config=self.cache_config, + scheduler_config=self.scheduler_config, + multimodal_config=self.multimodal_config, + lora_config=None, + ) model = model.eval() xm.wait_device_ops() @@ -463,10 +482,11 @@ def make_model_input_from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input + @torch.no_grad() def execute_model( self, model_input: ModelInputForTPU, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: Optional[List[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, ) -> List[SamplerOutput]: diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 03011e03058d8..c88aba7ae08cd 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -70,13 +70,13 @@ def __init__( def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" - self.device = xm.xla_device() - self.device_config.device = self.device torch.set_grad_enabled(False) torch.set_default_dtype(self.model_config.dtype) - # NOTE(woosuk): This is just a hack to initialize the TP group. - # This cannot perform the actual communication ops. + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. 
init_distributed_environment( world_size=self.parallel_config.world_size, rank=self.rank, @@ -88,6 +88,11 @@ def init_device(self) -> None: self.parallel_config.tensor_parallel_size, self.parallel_config.pipeline_parallel_size) + # Device initialization should happen after initializing the distributed + # runtime. + self.device = xm.xla_device() + self.device_config.device = self.device + # Set random seed. set_random_seed(self.model_config.seed) xm.set_rng_state(self.model_config.seed, self.device) @@ -200,8 +205,7 @@ def get_cache_block_size_bytes(self) -> int: @property def do_metadata_broadcast(self) -> bool: - # TODO(woosuk): Support TP. - return False + return self.parallel_config.tensor_parallel_size > 1 @property def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
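# A minimal, runnable sketch of the `unittest.mock.patch` workaround used in
# `load_model` above: while the patch is active, the rank lookup returns the
# value reported by the XLA runtime instead of the one assigned by the gloo
# process group, and the original function is restored on exit. `current_rank`
# is a stand-in defined here for illustration, not a vLLM symbol.
import os
from unittest.mock import patch

def current_rank() -> int:
    # Stand-in for get_tensor_model_parallel_rank(); normally reads the rank
    # assigned by the CPU (gloo) process group.
    return int(os.environ.get("RANK", "0"))

def load_embeddings_with_xla_rank(xla_rank: int) -> int:
    with patch(f"{__name__}.current_rank", return_value=xla_rank):
        # Anything called inside this block that looks up `current_rank`
        # through this module sees `xla_rank`.
        return current_rank()

assert load_embeddings_with_xla_rank(3) == 3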