Merge branch 'upstream' into mypy-follow-imports
DarkLight1337 committed Jul 27, 2024
2 parents a64f9f9 + 1ad86ac commit 84e56fe
Showing 46 changed files with 1,807 additions and 275 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest Pillow protobuf
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU are not supported

 # online inference
 docker exec cpu-test bash -c "
4 changes: 1 addition & 3 deletions .buildkite/test-pipeline.yaml
@@ -140,14 +140,13 @@ steps:
   working_dir: "/vllm-workspace/examples"
   mirror_hardwares: [amd]
   commands:
-  # install aws cli for llava_example.py
   # install tensorizer for tensorize_vllm_model.py
   - pip install awscli tensorizer
   - python3 offline_inference.py
   - python3 cpu_offload.py
   - python3 offline_inference_with_prefix.py
   - python3 llm_engine_example.py
-  - python3 llava_example.py
+  - python3 offline_inference_vision_language.py
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

 - label: Inputs Test

@@ -220,7 +219,6 @@ steps:

 - label: Tensorizer Test
   #mirror_hardwares: [amd]
-  soft_fail: true
   fast_check: true
   commands:
     - apt-get install -y curl libsodium23
1 change: 1 addition & 0 deletions .readthedocs.yaml
@@ -10,6 +10,7 @@ build:

 sphinx:
   configuration: docs/source/conf.py
+  fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats:
4 changes: 2 additions & 2 deletions Dockerfile.rocm
@@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
     *"rocm-6.1"*) \
         python3 -m pip uninstall -y torch torchvision \
         && python3 -m pip install --no-cache-dir --pre \
-               torch==2.5.0.dev20240710 \
-               torchvision==0.20.0.dev20240710 \
+               torch==2.5.0.dev20240726 \
+               torchvision==0.20.0.dev20240726 \
            --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
     *) ;; esac
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_paged_attention.py
@@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     parser.add_argument("--num-kv-heads", type=int, default=8)
     parser.add_argument("--head-size",
                         type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
                         default=128)
     parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
     parser.add_argument("--use-alibi", action="store_true")
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_rope.py
@@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora(
     parser.add_argument("--num-heads", type=int, default=8)
     parser.add_argument("--head-size",
                         type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
                         default=128)
     parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
     parser.add_argument("--dtype",
6 changes: 6 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -751,6 +751,9 @@ void paged_attention_v1_launcher(
     case 112:
       LAUNCH_PAGED_ATTENTION_V1(112);
       break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V1(120);
+      break;
     case 128:
       LAUNCH_PAGED_ATTENTION_V1(128);
       break;

@@ -912,6 +915,9 @@ void paged_attention_v2_launcher(
     case 112:
       LAUNCH_PAGED_ATTENTION_V2(112);
       break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V2(120);
+      break;
     case 128:
       LAUNCH_PAGED_ATTENTION_V2(128);
       break;
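The new cases line up with the Danube3-4B test referenced in the CPU-test script above: an attention head size is hidden_size divided by num_attention_heads, and for Danube3-4B that quotient is 120, which the paged-attention launchers previously rejected. A quick sanity check, assuming the figures from h2oai/h2o-danube3-4b's published config (verify against the model card before relying on them):

    # Assumed config values for h2oai/h2o-danube3-4b (not taken from this diff).
    hidden_size = 3840
    num_attention_heads = 32
    head_size = hidden_size // num_attention_heads
    assert head_size == 120  # the case added to both launchers above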
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -94,6 +94,7 @@ def setup(app):

 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
+    "aiohttp",
     "cpuinfo",
     "torch",
     "transformers",

@@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
 }

 autodoc_preserve_defaults = True
+autodoc_warningiserror = True

 navigation_with_keys = False
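The comment in the hunk above explains why the mocks exist; as for how they work, here is a standalone sketch of the effect of listing a package in autodoc_mock_imports, approximated with unittest.mock (Sphinx uses its own mock machinery internally):

    import sys
    from unittest import mock

    # Register a stand-in so importing documented modules succeeds even
    # when the real dependency is absent -- roughly what Sphinx does for
    # each name in autodoc_mock_imports.
    sys.modules["torch"] = mock.MagicMock()

    import torch  # resolves to the mock, so autodoc can import vllm modules

    print(torch.cuda.is_available())  # returns a MagicMock, not a real answer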
2 changes: 1 addition & 1 deletion docs/source/getting_started/amd-installation.rst
@@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.

    $ # Install PyTorch
    $ pip uninstall torch -y
-   $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
+   $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1

    $ # Build & install AMD SMI
    $ pip install /opt/rocm/share/amd_smi
2 changes: 1 addition & 1 deletion docs/source/getting_started/neuron-installation.rst
@@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able

    $ git clone https://github.com/vllm-project/vllm.git
    $ cd vllm
    $ pip install -U -r requirements-neuron.txt
-   $ pip install .
+   $ VLLM_TARGET_DEVICE="neuron" pip install .

 If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed.
8 changes: 8 additions & 0 deletions docs/source/models/supported_models.rst
@@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo
 The following is the list of model architectures that are currently supported by vLLM.
 Alongside each architecture, we include some popular models that use it.

+----
+
 Decoder-only Language Models
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. list-table::

@@ -186,6 +188,10 @@ Vision Language Models
      - Models
      - Example HuggingFace Models
      - :ref:`LoRA <lora>`
+   * - :code:`Blip2ForConditionalGeneration`
+     - BLIP-2
+     - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
+     -
    * - :code:`ChameleonForConditionalGeneration`
      - Chameleon
      - :code:`facebook/chameleon-7b` etc.

@@ -215,6 +221,8 @@ Vision Language Models
      - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
      -

+----
+
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
 for instructions on how to implement support for your model.
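With BLIP-2 now in the table, a minimal offline-inference sketch; the dict-style prompt with multi_modal_data follows the pattern in docs/source/models/vlm.rst, while the prompt template and image path are illustrative rather than taken from this commit:

    from PIL import Image

    from vllm import LLM

    llm = LLM(model="Salesforce/blip2-opt-2.7b")

    image = Image.open("example.jpg")  # hypothetical local image file
    outputs = llm.generate({
        # BLIP-2-style prompt; the exact template vLLM expects may differ.
        "prompt": "Question: What is shown in this image? Answer:",
        "multi_modal_data": {"image": image},
    })
    print(outputs[0].outputs[0].text)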
2 changes: 1 addition & 1 deletion docs/source/models/vlm.rst
@@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI

         generated_text = o.outputs[0].text
         print(generated_text)

-A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
+A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.


 Online OpenAI Vision API Compatible Inference
5 changes: 4 additions & 1 deletion examples/api_client.py
@@ -31,7 +31,10 @@ def post_http_request(prompt: str,
         "max_tokens": 16,
         "stream": stream,
     }
-    response = requests.post(api_url, headers=headers, json=pload, stream=True)
+    response = requests.post(api_url,
+                             headers=headers,
+                             json=pload,
+                             stream=stream)
     return response
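The change above matters once a caller consumes the response: requests only defers the body download when stream=True, so hardcoding it meant non-streaming calls held the connection open for no reason. A consumption sketch, assuming the demo server from this repo is running locally (the /generate route, NUL delimiter, and "text" field mirror examples/api_client.py, not a stable public API):

    import json

    import requests

    def fetch(prompt: str, stream: bool) -> None:
        pload = {"prompt": prompt, "n": 1, "max_tokens": 16, "stream": stream}
        # stream=stream (the fix above): only keep the connection open for
        # incremental reads when the server actually streams.
        response = requests.post("http://localhost:8000/generate",
                                 json=pload, stream=stream)
        if stream:
            # The demo server delimits streamed JSON chunks with NUL bytes.
            for chunk in response.iter_lines(chunk_size=8192,
                                             decode_unicode=False,
                                             delimiter=b"\0"):
                if chunk:
                    print(json.loads(chunk.decode("utf-8"))["text"])
        else:
            print(response.json()["text"])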
31 changes: 0 additions & 31 deletions examples/fuyu_example.py

This file was deleted.

25 changes: 0 additions & 25 deletions examples/llava_example.py

This file was deleted.

36 changes: 0 additions & 36 deletions examples/llava_next_example.py

This file was deleted.

55 changes: 0 additions & 55 deletions examples/minicpmv_example.py

This file was deleted.
