Merge branch 'upstream' into mypy-follow-imports
DarkLight1337 committed Jul 27, 2024
2 parents a64f9f9 + 1ad86ac commit 84e56fe
Showing 46 changed files with 1,807 additions and 275 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest Pillow protobuf
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU are not supported

 # online inference
 docker exec cpu-test bash -c "
4 changes: 1 addition & 3 deletions .buildkite/test-pipeline.yaml
@@ -140,14 +140,13 @@ steps:
   working_dir: "/vllm-workspace/examples"
   mirror_hardwares: [amd]
   commands:
-  # install aws cli for llava_example.py
   # install tensorizer for tensorize_vllm_model.py
   - pip install awscli tensorizer
   - python3 offline_inference.py
   - python3 cpu_offload.py
   - python3 offline_inference_with_prefix.py
   - python3 llm_engine_example.py
-  - python3 llava_example.py
+  - python3 offline_inference_vision_language.py
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

 - label: Inputs Test

@@ -220,7 +219,6 @@ steps:

 - label: Tensorizer Test
   #mirror_hardwares: [amd]
-  soft_fail: true
   fast_check: true
   commands:
     - apt-get install -y curl libsodium23
1 change: 1 addition & 0 deletions .readthedocs.yaml
@@ -10,6 +10,7 @@ build:

 sphinx:
   configuration: docs/source/conf.py
+  fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats:
4 changes: 2 additions & 2 deletions Dockerfile.rocm
@@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
     *"rocm-6.1"*) \
         python3 -m pip uninstall -y torch torchvision \
         && python3 -m pip install --no-cache-dir --pre \
-               torch==2.5.0.dev20240710 \
-               torchvision==0.20.0.dev20240710 \
+               torch==2.5.0.dev20240726 \
+               torchvision==0.20.0.dev20240726 \
            --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
     *) ;; esac
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_paged_attention.py
@@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     parser.add_argument("--num-kv-heads", type=int, default=8)
     parser.add_argument("--head-size",
                         type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
                         default=128)
     parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
     parser.add_argument("--use-alibi", action="store_true")
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_rope.py
@@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora(
     parser.add_argument("--num-heads", type=int, default=8)
     parser.add_argument("--head-size",
                         type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
                         default=128)
     parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
     parser.add_argument("--dtype",
6 changes: 6 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -751,6 +751,9 @@ void paged_attention_v1_launcher(
     case 112:
       LAUNCH_PAGED_ATTENTION_V1(112);
       break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V1(120);
+      break;
     case 128:
       LAUNCH_PAGED_ATTENTION_V1(128);
       break;

@@ -912,6 +915,9 @@ void paged_attention_v2_launcher(
     case 112:
       LAUNCH_PAGED_ATTENTION_V2(112);
       break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V2(120);
+      break;
     case 128:
       LAUNCH_PAGED_ATTENTION_V2(128);
       break;
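The new cases line up with the Danube3-4B test referenced in the CPU-test script above: an attention head size is hidden_size divided by num_attention_heads, and for Danube3-4B that quotient is 120, which the paged-attention launchers previously rejected. A quick sanity check, assuming the figures from h2oai/h2o-danube3-4b's published config (verify against the model card before relying on them):

    # Assumed config values for h2oai/h2o-danube3-4b (not taken from this diff).
    hidden_size = 3840
    num_attention_heads = 32
    head_size = hidden_size // num_attention_heads
    assert head_size == 120  # the case added to both launchers above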
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -94,6 +94,7 @@ def setup(app):

 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
+    "aiohttp",
     "cpuinfo",
     "torch",
     "transformers",

@@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
 }

 autodoc_preserve_defaults = True
+autodoc_warningiserror = True

 navigation_with_keys = False
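The comment in the hunk above explains why the mocks exist; as for how they work, here is a standalone sketch of the effect of listing a package in autodoc_mock_imports, approximated with unittest.mock (Sphinx uses its own mock machinery internally):

    import sys
    from unittest import mock

    # Register a stand-in so importing documented modules succeeds even
    # when the real dependency is absent -- roughly what Sphinx does for
    # each name in autodoc_mock_imports.
    sys.modules["torch"] = mock.MagicMock()

    import torch  # resolves to the mock, so autodoc can import vllm modules

    print(torch.cuda.is_available())  # returns a MagicMock, not a real answer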
2 changes: 1 addition & 1 deletion docs/source/getting_started/amd-installation.rst
@@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.

    $ # Install PyTorch
    $ pip uninstall torch -y
-   $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
+   $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1

    $ # Build & install AMD SMI
    $ pip install /opt/rocm/share/amd_smi
2 changes: 1 addition & 1 deletion docs/source/getting_started/neuron-installation.rst
@@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able

    $ git clone https://github.com/vllm-project/vllm.git
    $ cd vllm
    $ pip install -U -r requirements-neuron.txt
-   $ pip install .
+   $ VLLM_TARGET_DEVICE="neuron" pip install .

 If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed.
8 changes: 8 additions & 0 deletions docs/source/models/supported_models.rst
@@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo
 The following is the list of model architectures that are currently supported by vLLM.
 Alongside each architecture, we include some popular models that use it.

+----
+
 Decoder-only Language Models
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. list-table::

@@ -186,6 +188,10 @@ Vision Language Models
      - Models
      - Example HuggingFace Models
      - :ref:`LoRA <lora>`
+   * - :code:`Blip2ForConditionalGeneration`
+     - BLIP-2
+     - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
+     -
    * - :code:`ChameleonForConditionalGeneration`
      - Chameleon
      - :code:`facebook/chameleon-7b` etc.

@@ -215,6 +221,8 @@ Vision Language Models
      - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
      -

+----
+
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
 for instructions on how to implement support for your model.
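With BLIP-2 now in the table, a minimal offline-inference sketch; the dict-style prompt with multi_modal_data follows the pattern in docs/source/models/vlm.rst, while the prompt template and image path are illustrative rather than taken from this commit:

    from PIL import Image

    from vllm import LLM

    llm = LLM(model="Salesforce/blip2-opt-2.7b")

    image = Image.open("example.jpg")  # hypothetical local image file
    outputs = llm.generate({
        # BLIP-2-style prompt; the exact template vLLM expects may differ.
        "prompt": "Question: What is shown in this image? Answer:",
        "multi_modal_data": {"image": image},
    })
    print(outputs[0].outputs[0].text)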
2 changes: 1 addition & 1 deletion docs/source/models/vlm.rst
@@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI

         generated_text = o.outputs[0].text
         print(generated_text)

-A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
+A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.


 Online OpenAI Vision API Compatible Inference
5 changes: 4 additions & 1 deletion examples/api_client.py
@@ -31,7 +31,10 @@ def post_http_request(prompt: str,
         "max_tokens": 16,
         "stream": stream,
     }
-    response = requests.post(api_url, headers=headers, json=pload, stream=True)
+    response = requests.post(api_url,
+                             headers=headers,
+                             json=pload,
+                             stream=stream)
     return response
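The change above matters once a caller consumes the response: requests only defers the body download when stream=True, so hardcoding it meant non-streaming calls held the connection open for no reason. A consumption sketch, assuming the demo server from this repo is running locally (the /generate route, NUL delimiter, and "text" field mirror examples/api_client.py, not a stable public API):

    import json

    import requests

    def fetch(prompt: str, stream: bool) -> None:
        pload = {"prompt": prompt, "n": 1, "max_tokens": 16, "stream": stream}
        # stream=stream (the fix above): only keep the connection open for
        # incremental reads when the server actually streams.
        response = requests.post("http://localhost:8000/generate",
                                 json=pload, stream=stream)
        if stream:
            # The demo server delimits streamed JSON chunks with NUL bytes.
            for chunk in response.iter_lines(chunk_size=8192,
                                             decode_unicode=False,
                                             delimiter=b"\0"):
                if chunk:
                    print(json.loads(chunk.decode("utf-8"))["text"])
        else:
            print(response.json()["text"])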
31 changes: 0 additions & 31 deletions examples/fuyu_example.py

This file was deleted.

25 changes: 0 additions & 25 deletions examples/llava_example.py

This file was deleted.

36 changes: 0 additions & 36 deletions examples/llava_next_example.py

This file was deleted.

55 changes: 0 additions & 55 deletions examples/minicpmv_example.py

This file was deleted.
