diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 51618a2955fb1..829414bf8a3ba 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -56,6 +56,11 @@ steps:
env:
DOCKER_BUILDKIT: "1"
+ - input: "Provide Release version here"
+ fields:
+ - text: "What is the release version?"
+ key: "release-version"
+
- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
@@ -66,7 +71,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 084f865604fff..fd2c18b30407d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -85,9 +85,22 @@ repos:
entry: tools/png-lint.sh
language: script
types: [png]
+ - id: signoff-commit
+ name: Sign-off Commit
+ entry: bash
+ args:
+ - -c
+ - |
+ if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+ printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+ fi
+ language: system
+ verbose: true
+ stages: [commit-msg]
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false
+
diff --git a/csrc/ops.h b/csrc/ops.h
index 830424a4a9d20..f9f0f49faa292 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -156,6 +156,7 @@ torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
#ifndef USE_ROCM
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
+bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index e6f06d72fbfd4..72d549e597df5 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -58,7 +58,13 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
} else {
- TORCH_CHECK(false, "Unsupported scale group shapes for CUTLASS 3.x GEMM");
+ TORCH_CHECK(false,
+ "Unsupported scale group shapes for CUTLASS 3.x GEMM.\n "
+ "a_scale_group_shape must be [1, 128], got: [",
+ a_scale_group_shape[0], ", ", a_scale_group_shape[1],
+ "]\n"
+ "b_scale_group_shape must be [128, 128], got: [",
+ b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
}
}
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index da77312bc4b98..6bef55088682a 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -81,6 +81,19 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
return false;
}
+bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
+ // CUTLASS block-quantized FP8 kernels need at least CUDA 12.0
+ // and at least SM90 (Hopper)
+
+#if defined CUDA_VERSION
+ if (cuda_device_capability >= 90) {
+ return CUDA_VERSION >= 12000;
+ }
+#endif
+
+ return false;
+}
+
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
@@ -212,4 +225,4 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
"No compiled cutlass_scaled_mm_azp for a compute capability less than "
"CUDA device capability: ",
version_num);
-}
\ No newline at end of file
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 4ea67bbac3525..235373240ac36 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -330,6 +330,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
+ // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
+ ops.def(
+ "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
+ "bool");
+ ops.impl("cutlass_scaled_mm_supports_block_fp8",
+           &cutlass_scaled_mm_supports_block_fp8);
+
// Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability
ops.def(
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/source/assets/design/v1/prefix_caching/example-time-1.png
new file mode 100644
index 0000000000000..8849ca0237c39
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/example-time-1.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/source/assets/design/v1/prefix_caching/example-time-3.png
new file mode 100644
index 0000000000000..71b9e9b60ab9a
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/example-time-3.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-4.png b/docs/source/assets/design/v1/prefix_caching/example-time-4.png
new file mode 100644
index 0000000000000..017df1657c22e
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/example-time-4.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-5.png b/docs/source/assets/design/v1/prefix_caching/example-time-5.png
new file mode 100644
index 0000000000000..b80dd5b9949dc
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/example-time-5.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-6.png b/docs/source/assets/design/v1/prefix_caching/example-time-6.png
new file mode 100644
index 0000000000000..fbd7138596e89
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/example-time-6.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/source/assets/design/v1/prefix_caching/example-time-7.png
new file mode 100644
index 0000000000000..fc33ef50d4fdb
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/example-time-7.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/free.png b/docs/source/assets/design/v1/prefix_caching/free.png
new file mode 100644
index 0000000000000..cbc2f22222e04
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/free.png differ
diff --git a/docs/source/assets/design/v1/prefix_caching/overview.png b/docs/source/assets/design/v1/prefix_caching/overview.png
new file mode 100644
index 0000000000000..14fb985adca03
Binary files /dev/null and b/docs/source/assets/design/v1/prefix_caching/overview.png differ
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index 908c7cb4d38ee..af09bfecc6499 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -26,7 +26,7 @@ Check out the [building from source](#build-from-source) documentation for detai
pip install -r requirements-dev.txt
# Linting, formatting and static type checking
-pre-commit install
+pre-commit install --hook-type pre-commit --hook-type commit-msg
# You can manually run pre-commit with
pre-commit run --all-files
diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md
new file mode 100644
index 0000000000000..dc8432baef9d9
--- /dev/null
+++ b/docs/source/design/v1/prefix_caching.md
@@ -0,0 +1,228 @@
+# Automatic Prefix Caching
+
+Prefix caching kv-cache blocks is a popular optimization in LLM inference that avoids redundant prompt computation. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and doesn’t change model outputs, it has been widely adopted by many public endpoints (e.g., OpenAI, Anthropic) and most open-source LLM inference frameworks (e.g., SGLang).
+
+While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block:
+
+```text
+ Block 1 Block 2 Block 3
+ [A gentle breeze stirred] [the leaves as children] [laughed in the distance]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
+```
+
+In the example above, the KV cache in the first block can be uniquely identified by the tokens “A gentle breeze stirred”. The third block can be uniquely identified by the tokens in the block, “laughed in the distance”, together with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash as `hash(tuple[components])`, where the components are:
+
+* Parent hash value: The hash value of the parent block (the previous block in the prefix).
+* Block tokens: A tuple of the tokens in this block. Including the exact tokens reduces the chance of hash collisions.
+* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below).
+
+Note 1: We only cache full blocks.
+
+Note 2: The above hash key structure is not 100% collision free. Theoretically it’s still possible for different prefix tokens to produce the same hash value, but this is extremely unlikely in practice. Of course, contributions are welcome if you have an awesome idea to eliminate collisions entirely.
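+
+To make this structure concrete, here is a minimal sketch of the hashing scheme in plain Python. The helper names are illustrative rather than vLLM's actual API, and the real implementation also folds in the extra keys described above:
+
+```python
+from typing import Any, Optional
+
+def hash_block(parent_hash: Optional[int],
+               block_token_ids: tuple[int, ...],
+               extra_keys: Optional[tuple[Any, ...]] = None) -> int:
+    # The parent hash chains in the prefix; the exact token IDs reduce the
+    # chance of collisions; extra keys (e.g., LoRA IDs or image hashes) keep
+    # otherwise-identical token blocks distinct.
+    return hash((parent_hash, block_token_ids, extra_keys))
+
+def hash_prompt(token_ids: list[int], block_size: int) -> list[int]:
+    # Chain block hashes over a prompt; only full blocks are hashed.
+    hashes: list[int] = []
+    parent = None
+    for start in range(0, len(token_ids) - block_size + 1, block_size):
+        block = tuple(token_ids[start:start + block_size])
+        parent = hash_block(parent, block)
+        hashes.append(parent)
+    return hashes
+```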
+
+**A hashing example with multi-modality inputs**
+In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages:
+
+```text
+messages = [
+ {"role": "user",
+ "content": [
+ {"type": "text",
+ "text": "What's in this image?"
+ },
+ {"type": "image_url",
+ "image_url": {"url": image_url},
+ },
+ ]},
+]
+```
+
+It will become the following prompt:
+
+```text
+Prompt:
+ [INST]What's in this image?\n[IMG][/INST]
+
+Tokenized prompt:
+ [1, 3, 7493, 1681, 1294, 1593, 3937, 9551, 10, 4]
+
+Prompt with placeholders (<P>):
+  [1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <P>, <P>, ..., <P>, 4]
+```
+
+As we can see, after tokenization, the `[IMG]` is replaced by a sequence of placeholder tokens, and these placeholders are replaced by the image embeddings during prefill. The challenge for prefix caching in this case is that we need to distinguish the actual images from their placeholder tokens, which are identical across different images. To address this, we include the image hash generated by the frontend image processor as an extra hash. For example, the block hashes for the above prompt would be (assuming block size 16 and 41 placeholder tokens):
+
+```text
+Block 0
+  Parent hash: None
+  Token IDs: 1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <P>, ..., <P>
+  Extra hash: <image hash 1>
+Block 1
+  Parent hash: Block 0 hash
+  Token IDs: <P>, ..., <P>
+  Extra hash: <image hash 1>
+Block 2
+  Parent hash: Block 1 hash
+  Token IDs: <P>, ..., <P>
+  Extra hash: <image hash 1>
+Block 3
+  Parent hash: Block 2 hash
+  Token IDs: <P>, ..., <P>, 4
+  Extra hash: <image hash 1>
+```
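+
+As a rough sketch (the helper and argument names below are illustrative assumptions, not vLLM's actual API), the extra keys for a block can be derived from the multi-modal items whose placeholder ranges overlap the block's token range:
+
+```python
+def mm_extra_keys(block_start: int, block_end: int,
+                  mm_positions: list[dict], mm_hashes: list[str]) -> tuple:
+    # Collect the hash of every multi-modal item whose placeholder tokens
+    # overlap [block_start, block_end). These hashes become part of the block
+    # hash, so identical placeholder tokens backed by different images still
+    # produce different block hashes.
+    keys = []
+    for pos, mm_hash in zip(mm_positions, mm_hashes):
+        item_start = pos["offset"]
+        item_end = pos["offset"] + pos["length"]
+        if item_start < block_end and block_start < item_end:
+            keys.append(mm_hash)
+    return tuple(keys)
+```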
+
+In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow for the major KV cache operations (e.g., allocate, append, free, evict). Finally, we use an example to illustrate the end-to-end prefix caching workflow.
+
+## Data Structure
+
+The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
+
+```python
+class KVCacheBlock:
+ # The block ID (immutable)
+ block_id: int
+ # The block hash (will be assigned when the block is full,
+ # and will be reset when the block is evicted).
+ block_hash: BlockHashType
+ # The number of requests using this block now.
+ ref_cnt: int
+
+ # The pointers to form a doubly linked list for the free queue.
+ prev_free_block: Optional["KVCacheBlock"] = None
+ next_free_block: Optional["KVCacheBlock"] = None
+```
+
+There are two design points to highlight:
+
+1. We allocate all KVCacheBlocks as a block pool when initializing the KV cache manager. This avoids Python object creation overhead and makes it easy to track all blocks at all times.
+2. We embed the doubly linked list pointers directly in the KVCacheBlock, so that we can construct the free queue from the blocks themselves. This gives us two benefits (see the sketch below):
+   1. Moving an element from the middle of the queue to the tail takes O(1) time.
+   2. We avoid introducing another Python queue (e.g., `deque`), which would wrap each element in an extra object.
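+
+A minimal sketch of why the intrusive doubly linked list gives O(1) removal from the middle of the queue (simplified; the real free queue also maintains head/tail pointers and other bookkeeping):
+
+```python
+def remove_from_free_queue(block: "KVCacheBlock") -> None:
+    # Unlink `block` from its neighbors in O(1), wherever it sits in the
+    # queue. This is what happens when a cached block is "touched" by a new
+    # request and must stop being an eviction candidate.
+    if block.prev_free_block is not None:
+        block.prev_free_block.next_free_block = block.next_free_block
+    if block.next_free_block is not None:
+        block.next_free_block.prev_free_block = block.prev_free_block
+    block.prev_free_block = None
+    block.next_free_block = None
+```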
+
+As a result, we will have the following components when the KV cache manager is initialized:
+
+:::{image} /assets/design/v1/prefix_caching/overview.png
+:alt: Component Overview
+:::
+
+* Block Pool: A list of KVCacheBlock.
+* Free Block Queue: Only stores the pointers to the head and tail blocks; intermediate blocks are reached through their own linked-list pointers.
+* Cache blocks: Mapping from hash key to block IDs.
+* Request blocks: Mapping from request ID to allocated block IDs.
+
+## Operations
+
+### Block Allocation
+
+**New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation:
+
+1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens of the request and looking up the resulting hashes in the Cache Blocks mapping.
+2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
+    1. Compute the number of newly required blocks, and return early if there are not enough free blocks to allocate.
+    2. “Touch” the computed blocks. It increases the reference count of each computed block by one, and removes the block from the free queue if it wasn’t being used by any other request. This is to prevent these computed blocks from being evicted. See the example in the next section for an illustration.
+    3. Allocate new blocks by popping from the head of the free queue. If the head block is a cached block, this also “evicts” the block so that no other request can reuse it from now on.
+    4. If an allocated block is already full of tokens, we immediately add it to the Cache Blocks mapping, so that the block can be reused by other requests in the same batch.
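+
+Putting the new-request path together, the scheduler-side flow looks roughly like the sketch below. It is simplified and illustrative: the argument lists, attributes, and return-value handling are assumptions rather than the exact vLLM v1 signatures:
+
+```python
+def schedule_new_request(request, kv_cache_manager):
+    # 1. Reuse whatever prefix is already cached (full blocks only).
+    computed_blocks = kv_cache_manager.get_computed_blocks(request)
+    num_computed_tokens = len(computed_blocks) * kv_cache_manager.block_size
+
+    # 2. Allocate slots for the remaining tokens. This "touches" the computed
+    #    blocks and pops new blocks from the head of the free queue.
+    new_blocks = kv_cache_manager.allocate_slots(
+        request, request.num_tokens - num_computed_tokens, computed_blocks)
+    if new_blocks is None:
+        return None  # Not enough free blocks; the request stays queued.
+
+    return computed_blocks + new_blocks
+```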
+
+**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
+
+1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
+    1. Compute the number of newly required blocks, and return early if there are not enough free blocks to allocate.
+    2. Allocate new blocks by popping from the head of the free queue. If the head block is a cached block, this also “evicts” the block so that no other request can reuse it from now on.
+    3. Append token IDs to the slots in the existing blocks as well as the new blocks. If a block becomes full, we add it to the Cache Blocks mapping to cache it.
+
+**Duplicated blocks**
+Assuming block size is 4 and you send a request (Request 1) with prompt ABCDEF and decoding length 3:
+
+```text
+Prompt: [A, B, C, D, E, F]
+Output: [G, H, I]
+
+Time 0:
+ Tokens: [A, B, C, D, E, F, G]
+ Block Table: [0 (ABCD), 1 (EFG)]
+ Cache Blocks: 0
+Time 1:
+ Tokens: [A, B, C, D, E, F, G, H]
+ Block Table: [0 (ABCD), 1 (EFGH)]
+ Cache Blocks: 0, 1
+Time 2:
+ Tokens: [A, B, C, D, E, F, G, H, I]
+ Block Table: [0 (ABCD), 1 (EFGH), 2 (I)]
+ Cache Blocks: 0, 1
+```
+
+Now blocks 0 and 1 are cached, and we send the same request again (Request 2) with greedy sampling, so that it will produce exactly the same outputs as Request 1:
+
+```text
+Prompt: [A, B, C, D, E, F]
+Output: [G, H, I]
+
+Time 0:
+ Tokens: [A, B, C, D, E, F, G]
+ Block Table: [0 (ABCD), 3 (EFG)]
+ Cache Blocks: 0, 1
+Time 1:
+ Tokens: [A, B, C, D, E, F, G, H]
+ Block Table: [0 (ABCD), 3 (EFGH)]
+ Cache Blocks: 0, 1, 3
+```
+
+As can be seen, block 3 is a new full block and is cached. However, it is redundant with block 1, meaning that we cached the same content twice. In v0, when detecting that block 3 is a duplicate, we free block 3 and let Request 2 use block 1 instead, so its block table becomes `[0, 1]` at Time 1. However, the block table in vLLM v1 is append-only, meaning that changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication is eliminated when the request is freed.
+
+### Free
+
+When a request is finished, we free all of its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and blocks 2, 3, 4 and 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in *reverse* order. This is because the last block of a request hashes more tokens (it covers the longest prefix) and is therefore less likely to be reused by other requests, so it should be evicted first.
+
+:::{image} /assets/design/v1/prefix_caching/free.png
+:alt: Free Queue after Freeing a Request
+:::
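+
+In code, freeing a finished request amounts to something like the following sketch (illustrative only; the container names are assumptions rather than the exact vLLM v1 data structures):
+
+```python
+def free_request_blocks(request_id, req_to_blocks, free_block_queue):
+    blocks = req_to_blocks.pop(request_id)
+    # Append in reverse order: the tail blocks of a request cover the longest
+    # prefixes and are the least likely to be reused, so they should become
+    # eviction candidates first.
+    for block in reversed(blocks):
+        block.ref_cnt -= 1
+        if block.ref_cnt == 0:
+            free_block_queue.append(block)
+```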
+
+### Eviction (LRU)
+
+When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps:
+
+1. Pop the block from the head of the free queue. This is the LRU block to be evicted.
+2. Remove the block ID from the Cache Blocks mapping.
+3. Reset the block hash stored on the block, since its contents are about to be overwritten.
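+
+A sketch of this eviction path, under the same simplifying assumptions as the sketches above (the mapping name is illustrative):
+
+```python
+def evict_lru_block(free_block_queue, cached_block_hash_to_block):
+    # 1. Pop the LRU block from the head of the free queue.
+    block = free_block_queue.popleft()
+    # 2./3. If the block was cached, drop it from the Cache Blocks mapping and
+    #       reset its hash so the block can be refilled with new content.
+    if block.block_hash is not None:
+        del cached_block_hash_to_block[block.block_hash]
+        block.block_hash = None
+    return block
+```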
+
+## Example
+
+In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
+
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-1.png
+:alt: Example Time 1
+:::
+
+**Time 3: Request 0 makes block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-3.png
+:alt: Example Time 3
+:::
+
+**Time 4: Request 1 comes in with 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 3 of its 4 tokens.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-4.png
+:alt: Example Time 4
+:::
+
+**Time 5: Request 0 is finished and freed.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-5.png
+:alt: Example Time 5
+:::
+
+**Time 6: Request 1 is finished and freed.**
+
+:::{image} /assets/design/v1/prefix_caching/example-time-6.png
+:alt: Example Time 6
+:::
+
+**Time 7: Request 2 comes in with 33 prompt tokens, where the first 16 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache-hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
+
+:::{image} /assets/design/v1/prefix_caching/example-time-7.png
+:alt: Example Time 7
+:::
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
index d972dc85fc23c..1c98620aa2145 100644
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -12,6 +12,7 @@ supported_hardware
auto_awq
bnb
gguf
+int4
int8
fp8
quantized_kvcache
diff --git a/docs/source/features/quantization/int4.md b/docs/source/features/quantization/int4.md
new file mode 100644
index 0000000000000..f8939e5bf0150
--- /dev/null
+++ b/docs/source/features/quantization/int4.md
@@ -0,0 +1,166 @@
+(int4)=
+
+# INT4 W4A16
+
+vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS).
+
+Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int4-llms-for-vllm-668ec34bf3c9fa45f857df2c).
+
+:::{note}
+INT4 computation is supported on NVIDIA GPUs with compute capability >= 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell).
+:::
+
+## Prerequisites
+
+To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
+
+```console
+pip install llmcompressor
+```
+
+## Quantization Process
+
+The quantization process involves four main steps:
+
+1. Loading the model
+2. Preparing calibration data
+3. Applying quantization
+4. Evaluating accuracy in vLLM
+
+### 1. Loading the Model
+
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(
+ MODEL_ID, device_map="auto", torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+```
+
+### 2. Preparing Calibration Data
+
+When quantizing weights to INT4, you need sample data to estimate the weight updates and calibrated scales.
+It's best to use calibration data that closely matches your deployment data.
+For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
+
+```python
+from datasets import load_dataset
+
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load and preprocess the dataset
+ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+def preprocess(example):
+ return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+ds = ds.map(preprocess)
+
+def tokenize(sample):
+ return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+```
+
+### 3. Applying Quantization
+
+Now, apply the quantization algorithms:
+
+```python
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+
+# Configure the quantization algorithms
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# Apply quantization
+oneshot(
+ model=model,
+ dataset=ds,
+ recipe=recipe,
+ max_seq_length=MAX_SEQUENCE_LENGTH,
+ num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Save the compressed model
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+This process creates a W4A16 model with weights quantized to 4-bit integers.
+
+### 4. Evaluating Accuracy
+
+After quantization, you can load and run the model in vLLM:
+
+```python
+from vllm import LLM
+model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
+```
+
+To evaluate accuracy, you can use `lm_eval`:
+
+```console
+$ lm_eval --model vllm \
+ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
+ --tasks gsm8k \
+ --num_fewshot 5 \
+ --limit 250 \
+ --batch_size 'auto'
+```
+
+:::{note}
+Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
+:::
+
+## Best Practices
+
+- Start with 512 samples for calibration data, and increase if accuracy drops
+- Ensure the calibration data contains a high variety of samples to prevent overfitting towards a specific use case
+- Use a sequence length of 2048 as a starting point
+- Employ the chat template or instruction template that the model was trained with
+- If you've fine-tuned a model, consider using a sample of your training data for calibration
+- Tune key hyperparameters to the quantization algorithm:
+ - `dampening_frac` sets how much influence the GPTQ algorithm has. Lower values can improve accuracy, but can lead to numerical instabilities that cause the algorithm to fail.
+  - `actorder` sets the activation ordering. When compressing the weights of a layer, the order in which the weight channels are quantized matters. Setting `actorder="weight"` can improve accuracy without added latency.
+
+The following is an example of an expanded quantization recipe you can tune to your own use case:
+
+```python
+from compressed_tensors.quantization import (
+ QuantizationArgs,
+ QuantizationScheme,
+ QuantizationStrategy,
+ QuantizationType,
+)
+recipe = GPTQModifier(
+ targets="Linear",
+ config_groups={
+ "config_group": QuantizationScheme(
+ targets=["Linear"],
+ weights=QuantizationArgs(
+ num_bits=4,
+ type=QuantizationType.INT,
+ strategy=QuantizationStrategy.GROUP,
+ group_size=128,
+ symmetric=True,
+ dynamic=False,
+ actorder="weight",
+ ),
+ ),
+ },
+ ignore=["lm_head"],
+ update_size=NUM_CALIBRATION_SAMPLES,
+ dampening_frac=0.01
+)
+```
+
+## Troubleshooting and Support
+
+If you encounter any issues or have feature requests, please open an issue on the [`vllm-project/llm-compressor`](https://github.com/vllm-project/llm-compressor) GitHub repository. The full INT4 quantization example in `llm-compressor` is available [here](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w4a16/llama3_example.py).
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index fedb16f4350e5..b381f34bccd34 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -8,7 +8,7 @@ This quantization method is particularly useful for reducing model size while ma
Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
:::{note}
-INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
:::
## Prerequisites
@@ -132,4 +132,4 @@ Quantized models can be sensitive to the presence of the `bos` token. Make sure
## Troubleshooting and Support
-If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository.
+If you encounter any issues or have feature requests, please open an issue on the [`vllm-project/llm-compressor`](https://github.com/vllm-project/llm-compressor) GitHub repository.
diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index 704a16233981f..f3b0d6dc9bdc8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -2,6 +2,10 @@
This tab provides instructions on running vLLM with Intel Gaudi devices.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Ubuntu 22.04 LTS
diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md
index 88352f639567b..01793572fee7c 100644
--- a/docs/source/getting_started/installation/ai_accelerator/index.md
+++ b/docs/source/getting_started/installation/ai_accelerator/index.md
@@ -5,7 +5,8 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
+:selected:
:sync: tpu
:::{include} tpu.inc.md
@@ -25,7 +26,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -52,7 +53,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -72,7 +73,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -99,7 +100,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -119,7 +120,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -146,7 +147,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -166,7 +167,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -193,7 +194,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -213,7 +214,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -242,7 +243,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -262,7 +263,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -289,7 +290,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -309,7 +310,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
@@ -336,7 +337,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::::{tab-set}
:sync-group: device
-::::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
:::{include} tpu.inc.md
@@ -354,7 +355,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
::::
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
:sync: neuron
:::{include} neuron.inc.md
diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
index 145cc9d668efd..f149818acafb8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -4,6 +4,10 @@ vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Infere
Paged Attention and Chunked Prefill are currently in development and will be available soon.
Data types currently supported in Neuron SDK are FP16 and BF16.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index a7867472583d6..112e8d4d9b256 100644
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -2,6 +2,10 @@
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index 6827afc805fd8..c0d50feafce56 100644
--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -30,6 +30,10 @@ For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tp
You may need additional persistent storage for your TPU VMs. For more
information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options).
+:::{attention}
+There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
+:::
+
## Requirements
- Google Cloud TPU VM
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md
index 0808b869fdb7b..3bf1d47fa0ff9 100644
--- a/docs/source/getting_started/installation/cpu/apple.inc.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -4,6 +4,10 @@ vLLM has experimental support for macOS with Apple silicon. For now, users shall
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: `macOS Sonoma` or later
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
index 08a764e1a25f4..a661a0ca5adc7 100644
--- a/docs/source/getting_started/installation/cpu/arm.inc.md
+++ b/docs/source/getting_started/installation/cpu/arm.inc.md
@@ -4,6 +4,10 @@ vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CP
ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md
index 2f549ede0cf48..d53430403583c 100644
--- a/docs/source/getting_started/installation/cpu/index.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -5,7 +5,8 @@ vLLM is a Python library that supports the following CPU variants. Select your C
:::::{tab-set}
:sync-group: device
-::::{tab-item} x86
+::::{tab-item} Intel/AMD x86
+:selected:
:sync: x86
:::{include} x86.inc.md
@@ -15,7 +16,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C
::::
-::::{tab-item} ARM
+::::{tab-item} ARM AArch64
:sync: arm
:::{include} arm.inc.md
@@ -44,7 +45,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C
:::::{tab-set}
:sync-group: device
-::::{tab-item} x86
+::::{tab-item} Intel/AMD x86
:sync: x86
:::{include} x86.inc.md
@@ -54,7 +55,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C
::::
-::::{tab-item} ARM
+::::{tab-item} ARM AArch64
:sync: arm
:::{include} arm.inc.md
@@ -92,7 +93,7 @@ Currently, there are no pre-built CPU wheels.
:::::{tab-set}
:sync-group: device
-::::{tab-item} x86
+::::{tab-item} Intel/AMD x86
:sync: x86
:::{include} x86.inc.md
@@ -102,7 +103,7 @@ Currently, there are no pre-built CPU wheels.
::::
-::::{tab-item} ARM
+::::{tab-item} ARM AArch64
:sync: arm
:::{include} arm.inc.md
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
index f146ae0918b44..1dafc3660060e 100644
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -2,12 +2,20 @@
vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
+:::{tip}
+[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
+:::
+
## Set up using Python
### Pre-built wheels
@@ -29,7 +37,3 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
### Build image from source
## Extra information
-
-## Intel Extension for PyTorch
-
-- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md
index 0a61f889753a3..f82c4bda28620 100644
--- a/docs/source/getting_started/installation/gpu/index.md
+++ b/docs/source/getting_started/installation/gpu/index.md
@@ -5,7 +5,8 @@ vLLM is a Python library that supports the following GPU variants. Select your G
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
+:selected:
:sync: cuda
:::{include} cuda.inc.md
@@ -15,7 +16,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -25,7 +26,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
@@ -45,7 +46,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -55,7 +56,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -65,7 +66,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
@@ -87,7 +88,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -97,14 +98,14 @@ vLLM is a Python library that supports the following GPU variants. Select your G
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
There is no extra information on creating a new Python environment for this device.
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
There is no extra information on creating a new Python environment for this device.
@@ -118,7 +119,7 @@ There is no extra information on creating a new Python environment for this devi
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -128,7 +129,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -138,7 +139,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
@@ -157,7 +158,7 @@ There is no extra information on creating a new Python environment for this devi
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -167,7 +168,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -177,7 +178,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
@@ -196,7 +197,7 @@ There is no extra information on creating a new Python environment for this devi
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -206,7 +207,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -216,7 +217,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
@@ -233,7 +234,7 @@ There is no extra information on creating a new Python environment for this devi
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -243,7 +244,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -253,7 +254,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
@@ -270,7 +271,7 @@ There is no extra information on creating a new Python environment for this devi
:::::{tab-set}
:sync-group: device
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
:::{include} cuda.inc.md
@@ -279,7 +280,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
:sync: rocm
:::{include} rocm.inc.md
@@ -288,7 +289,7 @@ There is no extra information on creating a new Python environment for this devi
::::
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
:::{include} xpu.inc.md
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index 131ad1704ea11..c8fd11415cfda 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -2,6 +2,10 @@
vLLM supports AMD GPUs with ROCm 6.2.
+:::{attention}
+There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
+:::
+
## Requirements
- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
@@ -13,14 +17,6 @@ vLLM supports AMD GPUs with ROCm 6.2.
Currently, there are no pre-built ROCm wheels.
-However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
-docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
-
-:::{tip}
-Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
-for instructions on how to use this prebuilt docker image.
-:::
-
### Build wheel from source
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
@@ -112,7 +108,13 @@ for instructions on how to use this prebuilt docker image.
### Pre-built images
-Currently, there are no pre-built ROCm images.
+The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
+docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
+
+:::{tip}
+Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
+for instructions on how to use this prebuilt docker image.
+:::
### Build image from source
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index bc01c6000bc07..4116826789e5c 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -2,6 +2,10 @@
vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 0f5e013ce071a..c64c3a7208eeb 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -6,8 +6,23 @@ vLLM supports the following hardware platforms:
:::{toctree}
:maxdepth: 1
+:hidden:
gpu/index
cpu/index
ai_accelerator/index
:::
+
+-
+ - NVIDIA CUDA
+ - AMD ROCm
+ - Intel XPU
+-
+ - Intel/AMD x86
+ - ARM AArch64
+ - Apple silicon
+-
+ - Google TPU
+ - Intel Gaudi
+ - AWS Neuron
+ - OpenVINO
diff --git a/docs/source/index.md b/docs/source/index.md
index e90e81c72860a..ee25678e2c418 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -153,6 +153,13 @@ design/automatic_prefix_caching
design/multiprocessing
:::
+:::{toctree}
+:caption: V1 Design Documents
+:maxdepth: 2
+
+design/v1/prefix_caching
+:::
+
% How to contribute to the vLLM project
:::{toctree}
diff --git a/format.sh b/format.sh
index 4bcd0be0c96e5..3e78bf9865f0d 100755
--- a/format.sh
+++ b/format.sh
@@ -1,5 +1,6 @@
#!/bin/bash
echo "vLLM linting system has been moved from format.sh to pre-commit hook."
-echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook."
+echo "Please run 'pip install -r requirements-lint.txt', followed by"
+echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
echo "Then linters will run automatically before each commit."
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index f4081766e39a2..0a5ba1f98221f 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -192,7 +192,7 @@ def test_hash_block_tokens():
extra_keys)
assert isinstance(block_hash, BlockHashType)
assert block_hash.hash_value == hash(
- (parent_block_hash, *curr_block_token_ids))
+ (parent_block_hash, curr_block_token_ids, extra_keys))
assert block_hash.token_ids == curr_block_token_ids
assert block_hash.extra_keys == extra_keys
@@ -227,6 +227,38 @@ def test_hash_request_tokens():
assert block_hashes[1].extra_keys == ("hash2", )
+def test_hash_tokens_different_mm_input():
+ request1 = make_request(
+ request_id=0,
+ prompt_token_ids=[_ for _ in range(6)],
+ mm_positions=[{
+ "offset": 0,
+ "length": 3
+ }, {
+ "offset": 3,
+ "length": 3
+ }],
+ mm_hashes=["hash1", "hash2"],
+ )
+ request2 = make_request(
+ request_id=1,
+ prompt_token_ids=[_ for _ in range(6)],
+ mm_positions=[{
+ "offset": 0,
+ "length": 3
+ }, {
+ "offset": 3,
+ "length": 3
+ }],
+ mm_hashes=["hash3", "hash2"],
+ )
+ block_size = 3
+ block_hashes1 = hash_request_tokens(block_size, request1)
+ block_hashes2 = hash_request_tokens(block_size, request2)
+ assert block_hashes1[0] != block_hashes2[0]
+ assert block_hashes1[1] != block_hashes2[1]
+
+
def test_hash_request_tokens_no_mm_inputs():
request = make_request(
request_id=0,
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index aa451352f979f..18bedbf728599 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -456,6 +456,11 @@ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+ return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(
+ cuda_device_capability)
+
+
def cutlass_scaled_mm(a: torch.Tensor,
b: torch.Tensor,
scale_a: torch.Tensor,
diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
index c6c8a6034e20f..e8fec234c0225 100644
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -1,17 +1,29 @@
from abc import abstractmethod
from dataclasses import dataclass
-from typing import Any, Dict, Generic, List, Optional
+from typing import Any, Dict, Generic, List, Optional, Tuple
import torch
+from compressed_tensors.quantization import QuantizationStrategy
from vllm import _custom_ops as ops
from vllm import envs
from vllm.attention.backends.abstract import (AttentionLayer,
AttentionMetadata,
MLAAttentionImpl, T)
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import (get_tensor_model_parallel_world_size,
+ tensor_model_parallel_all_reduce)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
- RowParallelLinear)
+ LinearBase, RowParallelLinear,
+ UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
+ CompressedTensorsLinearMethod)
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsW8A8Fp8)
+from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ apply_fp8_linear_generic, current_platform_fp8_dtype, is_fp8)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ scaled_dequantize, scaled_quantize)
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.vllm_flash_attn import flash_attn_varlen_func
@@ -25,11 +37,11 @@ class MLACommonMetadata(AttentionMetadata):
class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
"""
- Common class for implementing repeated parts
-
+ Common class for implementing repeated parts
+
Main reference: DeepseekV2 paper, and FlashInfer Implementation
(https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
-
+
Deepseek's MLA attention works the following way:
* Use a single latent vector to represent the entire KV cache.
* The attention "simulates" a multi-head attention, while the compute is
@@ -46,7 +58,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
* V: V head dim.
* kv_c: latent/compressed KV
* q_c: latent/compressed Q
-
+
#
# Outside the MLA attention backend
#
@@ -55,21 +67,21 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
kv_c_k_pe (B, Lkv+R).
2. The kv_c_k_pe is split into kv_c (B, Lkv) and k_pe (B, R). cq
and kv_c are normalized.
-
+
#
# Inside the MLA attention backend
#
* if prefill:
-
- 3. The q_c is then projected up into the multi-head version.
- * q_c goes from (B, Lq) to (B, N, (P+R)), which is split into q_nope
- (B, N, P) and q_pe (B, N, R).
+
+ 3. The q_c is then projected up into the multi-head version.
+ * q_c goes from (B, Lq) to (B, N, (P+R)), which is split into q_nope
+ (B, N, P) and q_pe (B, N, R).
4. q_pe, k_pe are then passed through rotary embeddings.
5. kv_c and k_pe are concatenated and inserted into the cache
- 6. The kv_c is then projected up into the multi-head version.
- * kv_c goes from (B, Lkv) to (B, N, (P+V)) which has the nope
- dimensions for K and V, which is split into k_nope (B, N, P)
+ 6. The kv_c is then projected up into the multi-head version.
+ * kv_c goes from (B, Lkv) to (B, N, (P+V)) which has the nope
+ dimensions for K and V, which is split into k_nope (B, N, P)
and v (B, N, V).
7. q (B, N, (P+R)) and k (B, N, (P+R)) matrices are assembled from
q_nope, q_pe, k_nope, k_pe.
@@ -112,7 +124,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
From @tsu-bin's calculation, we only want to use the absorption technique
for decode. The prefill algorithm should still use the up-projected MHA
for less flops and memory usage.
-
+
"""
def __init__(
@@ -162,8 +174,19 @@ def __init__(
def _v_up_proj_and_o_proj(self, x):
if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
- return self.o_proj_absorbed(
- x.reshape(-1, self.num_heads * self.kv_lora_rank))[0]
+ if is_fp8(self.W_UV_O):
+ output_parallel = apply_fp8_linear_generic(
+ x.flatten(start_dim=1), self.W_UV_O, self.W_UV_O_scales,
+ self.reqaunt_input_group_shape,
+ self.reqaunt_weight_group_shape)
+ else:
+ output_parallel = torch.matmul(x.flatten(start_dim=1),
+ self.W_UV_O)
+ if self.tp_size > 1:
+ output = tensor_model_parallel_all_reduce(output_parallel)
+ else:
+ output = output_parallel
+ return output
else:
x = torch.einsum("bnl,lnv->bnv", x, self.W_UV)
return self.o_proj(x.reshape(-1,
@@ -171,6 +194,12 @@ def _v_up_proj_and_o_proj(self, x):
def _q_proj_and_k_up_proj(self, x):
if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+ if is_fp8(self.W_Q_UK):
+ return apply_fp8_linear_generic(
+ x, self.W_Q_UK, self.W_Q_UK_scales,
+ self.reqaunt_input_group_shape,
+ self.reqaunt_weight_group_shape).view(
+ -1, self.num_heads, self.kv_lora_rank)
return torch.matmul(x, self.W_Q_UK)\
.view(-1, self.num_heads, self.kv_lora_rank)
else:
@@ -179,8 +208,91 @@ def _q_proj_and_k_up_proj(self, x):
return torch.einsum("bnp,lnp->bnl", x, self.W_UK)\
.view(-1, self.num_heads, self.kv_lora_rank)
- def process_weights_after_loading(self):
- kv_b_proj_weight = self.kv_b_proj.weight.T
+ def process_weights_after_loading(self, act_dtype: torch.dtype):
+
+ def is_layer_fp8(layer: LinearBase) -> bool:
+ return isinstance(layer.quant_method, Fp8LinearMethod) or\
+ (isinstance(layer.quant_method, CompressedTensorsLinearMethod)\
+ and isinstance(layer.scheme, CompressedTensorsW8A8Fp8))
+
+ def quantization_scheme_supported(layer: LinearBase) -> bool:
+ return isinstance(layer.quant_method, UnquantizedLinearMethod) or \
+ is_layer_fp8(layer)
+
+ # TODO(lucas) This is very gross, we need a more wide scale refactor of
+ # all the FP8 code with a more standard way of
+ # defining schemes/group-shapes, we should also potentially force
+ # quant_methods to support a decompress function
+ #
+ # returns input_group_shape, weight_group_shape
+ def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \
+ Tuple[Tuple[int, int], Tuple[int, int]]:
+ if isinstance(layer.quant_method, Fp8LinearMethod):
+ if layer.quant_method.block_quant is not None:
+ weight_block_size = \
+ layer.quant_method.quant_config.weight_block_size
+ # per-token-group (1, X), block-quantized (X, Y)
+ return (1, weight_block_size[-1]), weight_block_size
+ else:
+ return (-1, -1), (-1, -1) # per-tensor, per-tensor
+ elif isinstance(layer.quant_method, CompressedTensorsLinearMethod)\
+ and isinstance(layer.scheme, CompressedTensorsW8A8Fp8):
+ # this is hacky but we always assume the for
+ # CompressedTensorsW8A8Fp8 the input is dynamic per-token
+ # we ignore if it is static-per-tensor since we are going to
+ # requantize after later anyways
+ strategy = layer.scheme.strategy
+ if strategy == QuantizationStrategy.TENSOR:
+ return (1, -1), (-1, -1) # per-token, per-tensor
+ elif strategy == QuantizationStrategy.CHANNEL:
+ return (1, -1), (-1, 1) # per-token, per-channel
+ else:
+ raise NotImplementedError(
+ f"QuantizationStrategy.{strategy} is not supported for "
+ "fp8 MLA, please run with VLLM_MLA_DISABLE=1")
+ else:
+ raise NotImplementedError(
+ "Can't determine scale group shapes for "
+ f"{layer.quant_method}, please run with VLLM_MLA_DISABLE=1"
+ )
+
+ def get_scales(layer: LinearBase) -> torch.Tensor:
+ if hasattr(layer, "weight_scale_inv"):
+ return layer.weight_scale_inv
+ return layer.weight_scale
+
+ def get_and_maybe_dequant_weights(layer: LinearBase):
+ if is_layer_fp8(layer):
+ if isinstance(layer.quant_method, \
+ CompressedTensorsLinearMethod) and \
+ isinstance(layer.scheme, CompressedTensorsW8A8Fp8):
+                # NOTE(lucas): not sure why, but `CompressedTensorsW8A8Fp8`
+ # seems to store weights as (input, output) instead of
+ # (output, input) so we need to transpose
+ weight = layer.weight.T # standardize to (output, input)
+ else:
+ weight = layer.weight
+ _, weight_scale_group_shape = \
+ get_scale_group_shapes_for_fp8(layer)
+ scales = get_scales(layer)
+
+ return scaled_dequantize(weight, scales,
+ weight_scale_group_shape)
+ else:
+ return layer.weight
+
+ if not (quantization_scheme_supported(self.kv_b_proj) and\
+ quantization_scheme_supported(self.q_proj) and\
+ quantization_scheme_supported(self.o_proj)):
+ raise NotImplementedError(
+ "Only FP8 and UnquantizedLinearMethod are supported for MLA"
+ ", please run with VLLM_MLA_DISABLE=1")
+
+ weight_dtype = self.kv_b_proj.weight.dtype
+ assert self.o_proj.weight.dtype == weight_dtype
+ assert self.q_proj.weight.dtype == weight_dtype
+
+ kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
assert kv_b_proj_weight.shape == (
self.kv_lora_rank,
self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
@@ -198,18 +310,35 @@ def process_weights_after_loading(self):
W_UK, W_UV = kv_b_proj_weight.split(
[self.qk_nope_head_dim, self.v_head_dim], dim=-1)
- q_proj = self.q_proj.weight.T\
+ q_proj_weight = get_and_maybe_dequant_weights(self.q_proj).T\
.view(-1, self.num_heads, self.qk_head_dim)
# can be W_Q or W_UQ depending q_lora_rank, the former if
# q_lora_rank is None, the latter otherwise. From the Attention backend
# perspective though we call these both W_Q and rely on the layer
# to pass in the correct matrix
- W_Q = q_proj[..., :self.qk_nope_head_dim]
- self.W_QR = q_proj[..., self.qk_nope_head_dim:]\
+ W_Q = q_proj_weight[..., :self.qk_nope_head_dim]
+ self.W_QR = q_proj_weight[..., self.qk_nope_head_dim:]\
.flatten(start_dim=1).contiguous()
+        # W_QR is small, so for simplicity we don't bother requantizing it
+ self.W_QR = self.W_QR.to(act_dtype)
+
if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+ requantization_enabled = not envs.VLLM_MLA_DISABLE_REQUANTIZATION
+ if is_fp8(weight_dtype) and requantization_enabled:
+                # This assumes it is wise to requantize using the same group
+                # shapes (i.e. strategy: per-tensor, per-channel, block, etc.)
+                # that the weights were originally quantized with
+ requant_input_group_shape, requant_weight_group_shape = \
+ get_scale_group_shapes_for_fp8(self.q_proj)
+ assert (requant_input_group_shape, requant_weight_group_shape)\
+ == get_scale_group_shapes_for_fp8(self.kv_b_proj)
+ assert (requant_input_group_shape, requant_weight_group_shape)\
+ == get_scale_group_shapes_for_fp8(self.o_proj)
+                self.requant_input_group_shape = requant_input_group_shape
+                self.requant_weight_group_shape = requant_weight_group_shape
+
#
# Perform matrix-absorption following
# https://github.com/flashinfer-ai/flashinfer/pull/551
@@ -223,25 +352,44 @@ def process_weights_after_loading(self):
# latter otherwise
# basically if q_lora_rank is none we are absorbing into q_proj
# instead of UQ
- self.W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\
+ W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\
.flatten(start_dim=1).contiguous()
- W_O = self.o_proj.weight\
+ if is_fp8(weight_dtype) and requantization_enabled:
+ W_Q_UK, W_Q_UK_scales = scaled_quantize(
+ W_Q_UK,
+                    self.requant_weight_group_shape,
+ quant_dtype=current_platform_fp8_dtype)
+ # For FP8 save the transpose so we can use
+ # `apply_w8a8_block_fp8_linear` directly
+ self.W_Q_UK = W_Q_UK.T.contiguous()
+ self.W_Q_UK_scales = W_Q_UK_scales.T.contiguous()
+ else:
+ self.W_Q_UK = W_Q_UK.to(act_dtype)
+
+ W_O = get_and_maybe_dequant_weights(self.o_proj)\
.view(-1, self.num_heads, self.v_head_dim)
- self.W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\
+ W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\
.flatten(start_dim=0, end_dim=1).contiguous()
- tp_size = get_tensor_model_parallel_world_size()
- self.o_proj_absorbed = RowParallelLinear(
- self.W_UV_O.shape[0] * tp_size,
- self.W_UV_O.shape[1],
- bias=False,
- # TODO(lucas) figure out how to properly forward quant_method
- #quant_config=self.o_proj.quant_method,
- )
-
- self.o_proj_absorbed.weight = torch.nn.Parameter(self.W_UV_O.T)
+ if is_fp8(weight_dtype) and requantization_enabled:
+ W_UV_O, W_UV_O_scales = scaled_quantize(
+ W_UV_O,
+                    self.requant_weight_group_shape,
+ quant_dtype=current_platform_fp8_dtype)
+ # For FP8 save the transpose so we can use
+ # `apply_w8a8_block_fp8_linear` directly
+ self.W_UV_O = W_UV_O.T.contiguous()
+ self.W_UV_O_scales = W_UV_O_scales.T.contiguous()
+ else:
+ self.W_UV_O = W_UV_O.to(act_dtype)
+
+ self.tp_size = get_tensor_model_parallel_world_size()
else:
+ if is_fp8(weight_dtype):
+ raise NotImplementedError(
+ "Currently fp8 requires matrix absorption")
+
self.W_UV = W_UV
self.W_UK = W_UK
self.W_Q = W_Q.flatten(start_dim=1)
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index da09bb70b4f1a..95dc119a47bb5 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -57,14 +57,12 @@ def get_state_cls() -> Type["TritonMLAState"]:
@staticmethod
def get_kv_cache_shape(
- num_blocks: int,
- block_size: int,
- num_kv_heads: int, # assumed to be 1 for MLA
- kv_lora_rank: int, # passed via head_size
+ num_blocks: int,
+ block_size: int,
+ num_kv_heads: int, # assumed to be 1 for MLA
+ head_size: int,
) -> Tuple[int, ...]:
- # TODO(lucas): remove hardcoding k_pe size as 1/8th of kv_lora_rank
- k_pe_size = kv_lora_rank // 8
- return (num_blocks, block_size, kv_lora_rank + k_pe_size)
+ return (num_blocks, block_size, head_size)
@staticmethod
def swap_blocks(
@@ -83,7 +81,7 @@ def copy_blocks(
@staticmethod
def get_supported_head_sizes() -> List[int]:
- return [512]
+ return [576]
class TritonMLAState(AttentionState):
@@ -624,8 +622,6 @@ def build(self, seq_lens: List[int], query_lens: List[int],
self.multimodal_placeholder_maps.items()
}
- num_kv_splits = 8
-
return TritonMLAMetadata(
num_prefills=self.num_prefills,
slot_mapping=slot_mapping_tensor,
@@ -645,7 +641,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
context_lens_tensor=context_lens_tensor,
block_tables=block_tables,
use_cuda_graph=use_captured_graph,
- num_kv_splits=num_kv_splits,
+ num_kv_splits=4, # TODO(lucas) add heuristic
head_dim=self.runner.model_config.get_head_size(),
)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a10daa351d85b..b7e8d536de9f0 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -206,9 +206,9 @@ def extra_repr(self) -> str:
s += f", backend={self.impl.__class__.__name__}"
return s
- def process_weights_after_loading(self):
+ def process_weights_after_loading(self, act_dtype: torch.dtype):
if hasattr(self.impl, "process_weights_after_loading"):
- self.impl.process_weights_after_loading()
+ self.impl.process_weights_after_loading(act_dtype)
class MultiHeadAttention(nn.Module):
diff --git a/vllm/config.py b/vllm/config.py
index 04c36fadf9323..84dbe2d5a6edd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -744,16 +744,16 @@ def is_deepseek_mla(self) -> bool:
# TODO add deepseek_v3
return hasattr(self.hf_text_config,
"model_type") and (self.hf_text_config.model_type
- in ('deepseek_v2'))
+ in ('deepseek_v2', 'deepseek_v3'))
def get_head_size(self) -> int:
# TODO remove hard code
if self.is_deepseek_mla:
+ qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim",
+ 0)
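+            # e.g. for DeepSeek-V2/V3 (kv_lora_rank=512, qk_rope_head_dim=64)
+            # this gives an MLA head size of 576, the width of each latent
+            # KV-cache entry.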
if self.use_mla:
- return self.hf_text_config.kv_lora_rank
+ return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
else:
- qk_rope_head_dim = getattr(self.hf_text_config,
- "qk_rope_head_dim", 0)
qk_nope_head_dim = getattr(self.hf_text_config,
"qk_nope_head_dim", 0)
if qk_rope_head_dim and qk_nope_head_dim:
@@ -972,6 +972,32 @@ def is_cross_encoder(self) -> bool:
@property
def use_mla(self) -> bool:
+ if self.quantization is not None and self.quantization not in [\
+ "fp8", "compressed-tensors"]:
+ logger.warning(
+ "MLA is not supported with %s quantization. "
+ "Disabling MLA.", self.quantization)
+ return False
+
+ # If using a "compressed-tensors" checkpoint, check that all groups
+ # have fp8 for both weights and activations.
+ if self.quantization == "compressed-tensors":
+ quant_config = self._parse_quant_hf_config()
+ for group_name, cfg in quant_config.get("config_groups",
+ {}).items():
+ act_type = cfg.get("input_activations", {}).get("type", "")
+ weight_type = cfg.get("weights", {}).get("type", "")
+ if act_type != "fp8" or weight_type != "fp8":
+ logger.warning(
+ "compressed-tensors MLA support requires fp8 "
+ "activations and weights in group '%s', but got "
+ "activations type '%s' and weights type '%s'.\n "
+ "Full config: %s", group_name, act_type, weight_type,
+ quant_config)
+ return False
+
use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
return use_mla
diff --git a/vllm/envs.py b/vllm/envs.py
index 1d773f15dd259..3fd5d8c107236 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -91,6 +91,7 @@
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
VLLM_MLA_DISABLE: bool = False
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
+ VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
def get_default_cache_root():
@@ -592,7 +593,16 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
# storing more weights, W_Q_UK and W_UV_O, so can increase memory usage,
# the is enabled by default
"VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
- lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1")))
+ lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))),
+
+    # When running MLA with matrix-absorption enabled and fp8-quantized
+    # weights, we perform the matrix absorption in float32 precision and then
+    # requantize the absorbed weights back to fp8. This flag disables that
+    # requantization step and instead converts the absorbed matrices to match
+    # the activation type. This can lead to higher memory and compute usage
+    # but better preserves the accuracy of the original model.
+ "VLLM_MLA_DISABLE_REQUANTIZATION":
+ lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
}
# end-env-vars-definition
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 2d8594cb8aafa..ee30ce96f0a1e 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -307,8 +307,8 @@ def __call__(self, input_ids: list[int],
# Note: In this method, if the tensors have different dimensions
# on CPU device fails, but on GPU it runs without error. Hence the
# unsqueeze above for scores, to match the token bitmask shape
- xgr.apply_token_bitmask_inplace(scores,
- self.token_bitmask.to(scores.device))
+ xgr.apply_token_bitmask_inplace(
+ scores, self.token_bitmask.to(scores.device, non_blocking=True))
if device_type != "cuda":
scores = scores.to(dtype).to(device_type).squeeze()
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 83951d614721c..a0395beddd1a4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -661,36 +661,17 @@ def get_default_config(
is_marlin: bool,
block_shape: Optional[List[int]] = None,
) -> Dict[str, int]:
- if dtype == "fp8_w8a8":
- if block_shape is None:
- config = {
- "BLOCK_SIZE_M": 128,
- "BLOCK_SIZE_N": 256,
- "BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 32,
- "num_warps": 8,
- "num_stages": 4,
- }
- if M <= E:
- config = {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": 128,
- "BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 1,
- "num_warps": 4,
- "num_stages": 4,
- }
- else:
- # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
- # BLOCK_SIZE_K must be divisible by block_shape[1]
- config = {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": block_shape[0],
- "BLOCK_SIZE_K": block_shape[1],
- "GROUP_SIZE_M": 32,
- "num_warps": 4,
- "num_stages": 3,
- }
+ if dtype == "fp8_w8a8" and block_shape is not None:
+ # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
+ # BLOCK_SIZE_K must be divisible by block_shape[1]
+ config = {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": block_shape[0],
+ "BLOCK_SIZE_K": block_shape[1],
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ }
else:
config = {
"BLOCK_SIZE_M": 64,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 5ba574d34010a..94ed6cc48a742 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -22,7 +22,8 @@
is_layer_skipped)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, apply_fp8_linear, convert_to_channelwise,
- cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
+ cutlass_block_fp8_supported, cutlass_fp8_supported,
+ normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
requantize_with_max_scale)
from vllm.model_executor.parameter import (BlockQuantScaleParameter,
ModelWeightParameter,
@@ -155,6 +156,7 @@ class Fp8LinearMethod(LinearMethodBase):
def __init__(self, quant_config: Fp8Config):
self.quant_config = quant_config
self.cutlass_fp8_supported = cutlass_fp8_supported()
+ self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
# For GPUs that lack FP8 hardware support, we can leverage the Marlin
# kernel for fast weight-only FP8 quantization
@@ -270,20 +272,24 @@ def create_weights(
layer.register_parameter("input_scale", None)
def process_weights_after_loading(self, layer: Module) -> None:
- # Block quant doesn't need to process weights after loading
+ # TODO(rob): refactor block quant into separate class.
if self.block_quant:
+ assert self.quant_config.activation_scheme == "dynamic"
if current_platform.is_rocm() and not is_navi():
- weight, weight_scale, _ = \
+ weight, weight_scale_inv, _ = (
normalize_e4m3fn_to_e4m3fnuz(
weight=layer.weight,
- weight_scale=layer.weight_scale_inv,
- input_scale=layer.input_scale)
- layer.weight = Parameter(weight, requires_grad=False)
- layer.weight_scale_inv = Parameter(weight_scale,
- requires_grad=False)
+ weight_scale=layer.weight_scale_inv))
+ else:
+ weight = layer.weight.data
+ weight_scale_inv = layer.weight_scale_inv.data
+
+ # Torch.compile cannot use Parameter subclasses.
+ layer.weight = Parameter(weight, requires_grad=False)
+ layer.weight_scale_inv = Parameter(weight_scale_inv,
+ requires_grad=False)
return
- layer.weight = torch.nn.Parameter(layer.weight.data,
- requires_grad=False)
+
# If checkpoint not serialized fp8, quantize the weights.
if not self.quant_config.is_checkpoint_fp8_serialized:
qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
@@ -392,6 +398,7 @@ def apply(self,
weight_scale=layer.weight_scale_inv,
input_scale=layer.input_scale,
bias=bias,
+ cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
)
return apply_fp8_linear(
@@ -545,8 +552,9 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
layer.w2_input_scale = None
def process_weights_after_loading(self, layer: Module) -> None:
- # Block quant doesn't need to process weights after loading
+ # TODO (rob): refactor block quant into separate class.
if self.block_quant:
+ assert self.quant_config.activation_scheme == "dynamic"
if current_platform.is_rocm() and not is_navi():
w13_weight, w13_weight_scale_inv, w13_input_scale = \
normalize_e4m3fn_to_e4m3fnuz(
@@ -556,22 +564,21 @@ def process_weights_after_loading(self, layer: Module) -> None:
normalize_e4m3fn_to_e4m3fnuz(
layer.w2_weight, layer.w2_weight_scale_inv,
layer.w2_input_scale)
- # Reset the parameter
- layer.w13_weight = torch.nn.Parameter(w13_weight,
- requires_grad=False)
- layer.w13_weight_scale_inv = torch.nn.Parameter(
- w13_weight_scale_inv, requires_grad=False)
- if w13_input_scale is not None:
- layer.w13_input_scale = torch.nn.Parameter(
- w13_input_scale, requires_grad=False)
- layer.w2_weight = torch.nn.Parameter(w2_weight,
- requires_grad=False)
- layer.w2_weight_scale_inv = torch.nn.Parameter(
- w2_weight_scale_inv, requires_grad=False)
- if w2_input_scale is not None:
- layer.w2_input_scale = torch.nn.Parameter(
- w2_input_scale, requires_grad=False)
+ else:
+ w13_weight = layer.w13_weight.data
+ w13_weight_scale_inv = layer.w13_weight_scale_inv.data
+ w2_weight = layer.w2_weight
+ w2_weight_scale_inv = layer.w2_weight_scale_inv
+
+ # torch.compile() cannot use Parameter subclasses.
+ layer.w13_weight = Parameter(w13_weight, requires_grad=False)
+ layer.w13_weight_scale_inv = Parameter(w13_weight_scale_inv,
+ requires_grad=False)
+ layer.w2_weight = Parameter(w2_weight, requires_grad=False)
+ layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv,
+ requires_grad=False)
return
+
# If checkpoint is fp16, quantize in place.
if not self.quant_config.is_checkpoint_fp8_serialized:
# If rocm (except Navi4x), use float8_e4m3fnuz as dtype
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index f256ecb8ee1b7..ba713a1d1974c 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -2,18 +2,33 @@
import functools
import json
import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import triton
import triton.language as tl
+from vllm import _custom_ops as ops
from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ _normalize_quant_group_shape, scaled_dequantize)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ apply_fp8_linear)
from vllm.platforms import current_platform
from vllm.utils import is_navi
logger = init_logger(__name__)
+current_platform_fp8_dtype = (torch.float8_e4m3fnuz
+ if current_platform.is_rocm() else
+ torch.float8_e4m3fn)
+
+
+def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool:
+ if isinstance(x, torch.Tensor):
+ x = x.dtype
+ return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz
+
def apply_w8a8_block_fp8_linear(
input: torch.Tensor,
@@ -22,25 +37,75 @@ def apply_w8a8_block_fp8_linear(
weight_scale: torch.Tensor,
input_scale: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
+ cutlass_block_fp8_supported: bool = True,
) -> torch.Tensor:
assert input_scale is None
# View input as 2D matrix for fp8 methods
input_2d = input.view(-1, input.shape[-1])
output_shape = [*input.shape[:-1], weight.shape[0]]
- q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1])
- output = w8a8_block_fp8_matmul(q_input,
- weight,
- x_scale,
- weight_scale,
- block_size,
- output_dtype=input.dtype)
-
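+    # The CUTLASS blockwise kernel assumes full 128x128 scale blocks, so fall
+    # back to the Triton kernel for weights whose dimensions are not both
+    # multiples of 128.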
+ shape_supported_by_cutlass = (weight.shape[0] % 128 == 0
+ and weight.shape[1] % 128 == 0)
+ if cutlass_block_fp8_supported and shape_supported_by_cutlass:
+ q_input, x_scale = per_token_group_quant_fp8(input_2d,
+ block_size[1],
+ column_major_scales=True)
+ output = ops.cutlass_scaled_mm(q_input,
+ weight.T,
+ out_dtype=input.dtype,
+ scale_a=x_scale,
+ scale_b=weight_scale.T)
+ else:
+ q_input, x_scale = per_token_group_quant_fp8(input_2d,
+ block_size[1],
+ column_major_scales=False)
+ output = w8a8_block_fp8_matmul(q_input,
+ weight,
+ x_scale,
+ weight_scale,
+ block_size,
+ output_dtype=input.dtype)
if bias is not None:
output = output + bias
return output.to(dtype=input.dtype).view(*output_shape)
+# Unify the interface between `apply_w8a8_block_fp8_linear` and
+# `apply_fp8_linear`
+# NOTE(lucas): this is quite messy, we should think through this more formally
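+# For example (illustrative): a deepseek-style block-quantized weight with
+# weight_group_shape=(128, 128) and input_group_shape=(1, 128) is dispatched
+# to `apply_w8a8_block_fp8_linear`; any other combination falls back to
+# `apply_fp8_linear` with per-tensor or per-token input scales.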
+def apply_fp8_linear_generic(
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ weight_scale: torch.Tensor,
+ input_group_shape: Tuple[int, int],
+ weight_group_shape: Tuple[int, int],
+    input_scale: Optional[torch.Tensor] = None,  # static scale, if any
+) -> torch.Tensor:
+ # View input as 2D matrix for fp8 methods
+ input = input.view(-1, input.shape[-1])
+
+ weight_group_shape = _normalize_quant_group_shape(\
+ weight, weight_group_shape)
+ input_group_shape = _normalize_quant_group_shape(input, input_group_shape)
+
+ def is_dim_blocked(dim, shape, group_shape):
+ return group_shape < shape[dim] and group_shape > 1
+
+ if is_dim_blocked(0, weight.shape, weight_group_shape[0])\
+ and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\
+ input_group_shape == (1, weight_group_shape[1]):
+ return apply_w8a8_block_fp8_linear(input, weight,
+ list(weight_group_shape),
+ weight_scale)
+ else:
+        # Despite having "linear" in its name, `apply_fp8_linear` does not
+        # conform to `torch.nn.functional.linear` (which computes
+        # `input @ weight.T`), so we explicitly transpose the weight matrix
+        # here
+ return apply_fp8_linear(input, weight.T, weight_scale.T,
+ use_per_token_if_dynamic=\
+ (input_group_shape == (1, input.shape[1])))
+
+
def input_to_float8(
x: torch.Tensor,
dtype: Optional[torch.dtype] = None
@@ -61,7 +126,6 @@ def input_to_float8(
def block_quant_to_tensor_quant(
x_q_block: torch.Tensor,
x_s: torch.Tensor,
- block_size: List[int],
) -> Tuple[torch.Tensor, torch.Tensor]:
"""This function converts block-wise quantization to tensor-wise
quantization. The inputs are block-wise quantization tensor `x_q_block`,
@@ -69,26 +133,7 @@ def block_quant_to_tensor_quant(
The outputs are tensor-wise quantization tensor and tensor-wise
quantization scale. Note only float8 is supported for now.
"""
- block_n, block_k = block_size[0], block_size[1]
- n, k = x_q_block.shape
- n_tiles = (n + block_n - 1) // block_n
- k_tiles = (k + block_k - 1) // block_k
- assert n_tiles == x_s.shape[0]
- assert k_tiles == x_s.shape[1]
-
- x_dq_block = x_q_block.to(torch.float32)
-
- x_dq_block_tiles = [[
- x_dq_block[
- j * block_n:min((j + 1) * block_n, n),
- i * block_k:min((i + 1) * block_k, k),
- ] for i in range(k_tiles)
- ] for j in range(n_tiles)]
-
- for i in range(k_tiles):
- for j in range(n_tiles):
- x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]
-
+ x_dq_block = scaled_dequantize(x_q_block, x_s)
x_q_tensor, scale = input_to_float8(x_dq_block, dtype=x_q_block.dtype)
return x_q_tensor, scale
@@ -99,10 +144,7 @@ def _per_token_group_quant_fp8(
y_ptr,
y_q_ptr,
y_s_ptr,
- # Stride of input
- y_stride,
- # Columns of input
- N,
+ group_size,
# Avoid to divide zero
eps,
# Information for float8
@@ -117,12 +159,60 @@ def _per_token_group_quant_fp8(
"""
# Map the program id to the row of X and Y it should compute.
g_id = tl.program_id(0)
- y_ptr += g_id * y_stride
- y_q_ptr += g_id * y_stride
+ y_ptr += g_id * group_size
+ y_q_ptr += g_id * group_size
y_s_ptr += g_id
cols = tl.arange(0, BLOCK) # N <= BLOCK
- mask = cols < N
+ mask = cols < group_size
+
+ y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+ # Quant
+ _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+ y_s = _absmax / fp8_max
+ y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+ tl.store(y_q_ptr + cols, y_q, mask=mask)
+ tl.store(y_s_ptr, y_s)
+
+
+@triton.jit
+def _per_token_group_quant_fp8_colmajor(
+ # Pointers to inputs and output
+ y_ptr,
+ y_q_ptr,
+ y_s_ptr,
+ group_size,
+ # Num columns of y
+ y_num_columns,
+ # Stride from one column to the next of y_s
+ y_s_col_stride,
+    # Avoid division by zero
+ eps,
+ # Information for float8
+ fp8_min,
+ fp8_max,
+ # Meta-parameters
+ BLOCK: tl.constexpr,
+):
+ """A Triton-accelerated function to perform per-token-group
+ quantization on a tensor.
+    This function converts the tensor values into float8 values and writes
+    the group scales out in column-major order.
+ """
+ # Map the program id to the row of X and Y it should compute.
+ g_id = tl.program_id(0)
+ y_ptr += g_id * group_size
+ y_q_ptr += g_id * group_size
+
+    # Convert g_id, the flattened block coordinate, to 2D so we can index
+ # into the output y_scales matrix
+ blocks_per_row = y_num_columns // group_size
+ scale_col = g_id % blocks_per_row
+ scale_row = g_id // blocks_per_row
+ y_s_ptr += scale_col * y_s_col_stride + scale_row
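+    # e.g. with y_num_columns=256 and group_size=128, blocks_per_row=2 and
+    # g_id=5 maps to scale_row=2, scale_col=1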
+
+ cols = tl.arange(0, BLOCK) # group_size <= BLOCK
+ mask = cols < group_size
y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
# Quant
@@ -139,12 +229,13 @@ def per_token_group_quant_fp8(
group_size: int,
eps: float = 1e-10,
dtype: Optional[torch.dtype] = None,
+ column_major_scales: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Function to perform per-token-group quantization on an input tensor `x`.
It converts the tensor values into signed float8 values and returns the
quantized tensor along with the scaling factor used for quantization.
Args:
- x: The input tenosr with ndim >= 2.
+ x: The input tensor with ndim >= 2.
group_size: The group size used for quantization.
eps: The minimum to avoid dividing zero.
dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
@@ -168,29 +259,46 @@ def per_token_group_quant_fp8(
x_q = torch.empty_like(x, device=x.device, dtype=dtype)
M = x.numel() // group_size
N = group_size
- x_s = torch.empty(
- x.shape[:-1] + (x.shape[-1] // group_size, ),
- device=x.device,
- dtype=torch.float32,
- )
+ if column_major_scales:
+ shape = (x.shape[-1] // group_size, ) + x.shape[:-1]
+ x_s = torch.empty(shape, device=x.device,
+ dtype=torch.float32).permute(-1, -2)
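+        # Allocating the scales transposed and permuting back yields a view
+        # that is contiguous along the token dimension (column-major), which
+        # is the layout the CUTLASS block-fp8 path expects.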
+ else:
+ shape = x.shape[:-1] + (x.shape[-1] // group_size, )
+ x_s = torch.empty(shape, device=x.device, dtype=torch.float32)
BLOCK = triton.next_power_of_2(N)
# heuristics for number of warps
num_warps = min(max(BLOCK // 256, 1), 8)
num_stages = 1
- _per_token_group_quant_fp8[(M, )](
- x,
- x_q,
- x_s,
- group_size,
- N,
- eps,
- fp8_min=fp8_min,
- fp8_max=fp8_max,
- BLOCK=BLOCK,
- num_warps=num_warps,
- num_stages=num_stages,
- )
+ if column_major_scales:
+ _per_token_group_quant_fp8_colmajor[(M, )](
+ x,
+ x_q,
+ x_s,
+ group_size,
+ x.shape[1],
+ x_s.stride(1),
+ eps,
+ fp8_min=fp8_min,
+ fp8_max=fp8_max,
+ BLOCK=BLOCK,
+ num_warps=num_warps,
+ num_stages=num_stages,
+ )
+ else:
+ _per_token_group_quant_fp8[(M, )](
+ x,
+ x_q,
+ x_s,
+ group_size,
+ eps,
+ fp8_min=fp8_min,
+ fp8_max=fp8_max,
+ BLOCK=BLOCK,
+ num_warps=num_warps,
+ num_stages=num_stages,
+ )
return x_q, x_s
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 83055d6000d83..95e785dcc4078 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,5 +1,5 @@
"""This file is used for /tests and /benchmarks"""
-from typing import List, Optional
+from typing import List, Optional, Tuple
import numpy
import torch
@@ -20,6 +20,120 @@
}
+# Normalize the group_shape to the full extent for any dims that are -1
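+# e.g. for a (256, 512) tensor, (-1, 128) normalizes to (256, 128) and
+# (1, -1) normalizes to (1, 512).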
+def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int,
+ int]):
+ # -1 means full extent
+ return (group_shape[0] if group_shape[0] > 0 else x.shape[-2],
+ group_shape[1] if group_shape[1] > 0 else x.shape[-1])
+
+
+# Useful when treating N-dimensional group scaling as extended numpy-style
+# broadcasting. Broadcasting in numpy simply stretches dimensions with an
+# extent of 1 to match the target shape by repeating the data along that
+# dimension. We extend these semantics so that if the extent of a dimension
+# in the source shape is not 1 and does not match the target shape, we repeat
+# each element along that dimension target_shape[dim] // src_shape[dim] times.
+# For example, if we have:
+#   a = [[1, 2], and target_shape = (2, 4)
+#        [3, 4]]
+# then we would expand a to:
+#   a = [[1, 1, 2, 2],
+#        [3, 3, 4, 4]]
+# NOTE this function does not explicitly broadcast dimensions with an extent
+# of 1, since this can be done implicitly by pytorch
+def group_broadcast(t, shape):
+ for i, s in enumerate(shape):
+ if t.shape[i] != s and t.shape[i] != 1:
+ assert s % t.shape[i] == 0
+ t = t.unsqueeze(i + 1)\
+ .expand(*t.shape[:i+1], s // t.shape[i], *t.shape[i+1:])\
+ .flatten(i, i + 1)
+ return t
+
+
+# Quantize assuming one scale per group of elements with shape group_shape,
+# example group shapes:
+# * (-1, -1) for per-tensor quantization
+# * (1, -1) for per-row quantization
+# * (-1, 1) for per-column quantization
+# * (128, 128) for 128x128 deepseek style block quantization
+# * (1, 128) for deepseek style activation quantization
+# (i.e. per-token-per-group)
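+# e.g. (illustrative) a (256, 512) weight quantized with group_shape
+# (128, 128) yields a (256, 512) fp8 tensor plus a (2, 4) scale tensor,
+# one scale per 128x128 block.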
+def scaled_quantize(
+ x: torch.Tensor,
+ group_shape: Tuple[int, int],
+ quant_dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+ group_shape = _normalize_quant_group_shape(x, group_shape)
+ assert quant_dtype.is_floating_point, \
+ "currently `scaled_quantize` only supports floating point dtypes " \
+ "but could be extended to support other dtypes"
+
+ finfo = torch.finfo(quant_dtype)
+
+ # Reshape (M, N) into (BLK_M, BLOCK_SIZE_M, BLK_N, BLOCK_SIZE_N)
+ assert x.ndim == 2
+ assert x.shape[0] % group_shape[0] == 0 and x.shape[1] % group_shape[1] == 0
+ blk_m, blk_n = x.shape[0] // group_shape[0], x.shape[1] // group_shape[1]
+ x_blkd = x.reshape(blk_m, group_shape[0], blk_n, group_shape[1])
+
+ # Permute to (BLK_M, BLK_N, BLOCK_SIZE_M, BLOCK_SIZE_N)
+ x_blkd_permd = x_blkd.permute(0, 2, 1, 3)
+ # Flatten to (BLK_M, BLK_N, BLOCK_SIZE_M * BLOCK_SIZE_N)
+ x_blkd_permd = x_blkd_permd.flatten(start_dim=2)
+
+ # Compute scales
+ min_val, max_val = x_blkd_permd.aminmax(dim=-1)
+ amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+ scale = finfo.max / amax
+
+    # Apply scale and convert from:
+ # (BLK_M, BLK_N, BLOCK_SIZE_M * BLOCK_SIZE_N) to (M, N)
+ x_scl_sat = (x_blkd_permd * scale.unsqueeze(-1))\
+ .clamp(min=finfo.min, max=finfo.max)\
+ .reshape(blk_m, blk_n, group_shape[0], group_shape[1])\
+ .permute(0, 2, 1, 3)\
+ .reshape(x.shape)
+
+ return x_scl_sat.to(quant_dtype).contiguous(), scale.float().reciprocal()
+
+
+# Inverts `scaled_quantize`
+def scaled_dequantize(
+ x_q: torch.Tensor,
+ x_s: torch.Tensor,
+ group_shape: Optional[Tuple[int, int]] = None,
+ out_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+ if group_shape is not None:
+ group_shape = _normalize_quant_group_shape(x_q, group_shape)
+
+ if x_s.ndim == 0: # scalar
+ x_s = x_s.unsqueeze(-1).unsqueeze(-1) # convert to (1, 1) tensor
+ if x_s.ndim == 1:
+ if group_shape is None:
+ raise AssertionError(
+                "if x_s is a 1D tensor, group_shape must be provided, "
+                "otherwise it's ambiguous which dimension to broadcast x_s to")
+ # unsqueeze the scales for the dimension where we want to broadcast
+ # across the full extent
+ if group_shape[0] == x_q.shape[-2]:
+ x_s = x_s.unsqueeze(-2)
+ elif group_shape[1] == x_q.shape[-1]:
+ x_s = x_s.unsqueeze(-1)
+ else:
+ raise AssertionError(
+ "if x_s is a vector we should be broadcasting it to the full "
+ "extent of one of the dimensions")
+
+ if group_shape is not None:
+ assert x_s.shape[-1] == x_q.shape[-1] // group_shape[1]
+ assert x_s.shape[-2] == x_q.shape[-2] // group_shape[0]
+ x_s = group_broadcast(x_s.to(torch.float32), x_q.shape)
+ return (x_q.to(torch.float32) * x_s).to(out_dtype)
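+
+
+# Round-trip sketch (illustrative only):
+#   w = torch.randn(256, 512)
+#   w_q, w_s = scaled_quantize(w, (128, 128),
+#                              quant_dtype=torch.float8_e4m3fn)
+#   w_dq = scaled_dequantize(w_q, w_s, (128, 128))  # approximately equal to w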
+
+
def pack_quantized_values_into_int32(w_q: torch.Tensor,
wtype: ScalarType,
packed_dim: int = 0):
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index c93a3951731e8..1d802f0e47dcf 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -30,6 +30,16 @@ def cutlass_fp8_supported() -> bool:
return ops.cutlass_scaled_mm_supports_fp8(capability)
+def cutlass_block_fp8_supported() -> bool:
+ if not current_platform.is_cuda():
+ return False
+
+ capability_tuple = current_platform.get_device_capability()
+ capability = -1 if capability_tuple is None else capability_tuple.to_int()
+
+ return ops.cutlass_scaled_mm_supports_block_fp8(capability)
+
+
def per_tensor_dequantize(
tensor: torch.Tensor, inv_scale: Union[float,
torch.Tensor]) -> torch.Tensor:
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 62babcddd61b1..4be511d12838d 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -398,11 +398,13 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
# parameters onto device for processing and back off after.
with device_loading_context(module, target_device):
quant_method.process_weights_after_loading(module)
- elif isinstance(module, Attention) and \
+ if isinstance(module, Attention) and \
hasattr(module, "process_weights_after_loading"):
# When attention modules need to process weights after
# currently only used by MLA
- module.process_weights_after_loading()
+ # TODO(lucas): see if there is a way to unify the signatures
+ # of process_weights_after_loading
+ module.process_weights_after_loading(model_config.dtype)
return model.eval()
@@ -439,6 +441,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
with device_loading_context(
module, torch.device(device_config.device)):
quant_method.process_weights_after_loading(module)
+ if isinstance(module, Attention) and \
+ hasattr(module, "process_weights_after_loading"):
+                # When attention modules need to process weights after
+                # loading (currently only used by MLA)
+ module.process_weights_after_loading(model_config.dtype)
return model.eval()
@@ -633,6 +640,12 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
quant_method = getattr(module, "quant_method", None)
if quant_method is not None:
quant_method.process_weights_after_loading(module)
+ if isinstance(module, Attention) and \
+ hasattr(module, "process_weights_after_loading"):
+                # When attention modules need to process weights after
+                # loading (currently only used by MLA)
+ module.process_weights_after_loading(
+ model_config.dtype)
rank = get_tensor_model_parallel_rank()
pattern = os.path.join(
local_model_path,
@@ -1272,7 +1285,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
class RunaiModelStreamerLoader(BaseModelLoader):
"""
- Model loader that can load safetensors
+ Model loader that can load safetensors
files from local FS or S3 bucket.
"""
@@ -1369,6 +1382,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
if quant_method is not None:
with device_loading_context(module, target_device):
quant_method.process_weights_after_loading(module)
+ if isinstance(module, Attention) and \
+ hasattr(module, "process_weights_after_loading"):
+            # When attention modules need to process weights after
+            # loading (currently only used by MLA)
+ module.process_weights_after_loading(model_config.dtype)
return model.eval()
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index 0b44f0d062c40..f6ab53c85faa3 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -27,7 +27,7 @@
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (get_pp_group,
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce)
@@ -333,12 +333,156 @@ def forward(
return output
+class DeepseekV3MLAAttention(nn.Module):
+ """
+ Main reference: DeepseekV2 paper, and FlashInfer Implementation
+ (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
+
+ For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py
+ """
+
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ hidden_size: int,
+ num_heads: int,
+ qk_nope_head_dim: int,
+ qk_rope_head_dim: int,
+ v_head_dim: int,
+ q_lora_rank: Optional[int],
+ kv_lora_rank: int,
+ rope_theta: float = 10000,
+ rope_scaling: Optional[Dict[str, Any]] = None,
+ max_position_embeddings: int = 8192,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.qk_nope_head_dim = qk_nope_head_dim
+ self.qk_rope_head_dim = qk_rope_head_dim
+ self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+ self.v_head_dim = v_head_dim
+
+ self.q_lora_rank = q_lora_rank
+ self.kv_lora_rank = kv_lora_rank
+
+ self.num_heads = num_heads
+ tp_size = get_tensor_model_parallel_world_size()
+ assert num_heads % tp_size == 0
+ self.num_local_heads = num_heads // tp_size
+
+ self.scaling = self.qk_head_dim**-0.5
+ self.rope_theta = rope_theta
+ self.max_position_embeddings = max_position_embeddings
+
+ if self.q_lora_rank is not None:
+ self.q_a_proj = ReplicatedLinear(self.hidden_size,
+ self.q_lora_rank,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.q_a_proj")
+ self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+ eps=config.rms_norm_eps)
+ self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+ self.num_heads *
+ self.qk_head_dim,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.q_b_proj")
+ else:
+ self.q_proj = ColumnParallelLinear(self.hidden_size,
+ self.num_heads *
+ self.qk_head_dim,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.q_proj")
+
+ self.kv_a_proj_with_mqa = ReplicatedLinear(
+ self.hidden_size,
+ self.kv_lora_rank + self.qk_rope_head_dim,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.kv_a_proj_with_mqa")
+ self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+ eps=config.rms_norm_eps)
+ self.kv_b_proj = ColumnParallelLinear(
+ self.kv_lora_rank,
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.kv_b_proj")
+ self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+ self.hidden_size,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.o_proj")
+
+ rope_scaling["rope_type"] = 'deepseek_yarn'
+ self.rotary_emb = get_rope(qk_rope_head_dim,
+ rotary_dim=qk_rope_head_dim,
+ max_position=max_position_embeddings,
+ base=rope_theta,
+ rope_scaling=rope_scaling,
+ is_neox_style=False)
+ if rope_scaling:
+ mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+ scaling_factor = rope_scaling["factor"]
+ mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+ self.scaling = self.scaling * mscale * mscale
+
+ self.mla_attn = Attention(
+ num_heads=self.num_local_heads,
+ head_size=self.kv_lora_rank,
+ scale=self.scaling,
+ num_kv_heads=1,
+ cache_config=cache_config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.attn",
+ use_mla=True,
+ # MLA Args
+ q_lora_rank=self.q_lora_rank,
+ kv_lora_rank=self.kv_lora_rank,
+ qk_nope_head_dim=self.qk_nope_head_dim,
+ qk_rope_head_dim=self.qk_rope_head_dim,
+ qk_head_dim=self.qk_head_dim,
+ v_head_dim=self.v_head_dim,
+ rotary_emb=self.rotary_emb,
+ q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
+ kv_b_proj=self.kv_b_proj,
+ o_proj=self.o_proj,
+ )
+
+ self.prefix = prefix
+ self.debug_layer_idx = int(self.prefix.split(".")[-2])
+
+ def forward(
+ self,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ kv_cache: torch.Tensor,
+ attn_metadata: AttentionMetadata,
+ ) -> torch.Tensor:
+ if self.q_lora_rank is not None:
+ ckq = self.q_a_proj(hidden_states)[0]
+ hidden_states_or_q_c = self.q_a_layernorm(ckq)
+ else:
+ hidden_states_or_q_c = hidden_states
+ kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
+ [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+ kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
+ return self.mla_attn(hidden_states_or_q_c, kv_c_normed, k_pe, kv_cache,
+ attn_metadata)
+
+
class DeepseekV3DecoderLayer(nn.Module):
def __init__(
self,
config: PretrainedConfig,
prefix: str,
+ model_config: ModelConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
) -> None:
@@ -351,7 +495,11 @@ def __init__(
# DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index.
layer_idx = int(prefix.split(sep='.')[-1])
- self.self_attn = DeepseekV3Attention(
+ if model_config.use_mla:
+ attn_cls = DeepseekV3MLAAttention
+ else:
+ attn_cls = DeepseekV3Attention
+ self.self_attn = attn_cls(
config=config,
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
@@ -428,6 +576,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
+ model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
@@ -447,6 +596,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
lambda prefix: DeepseekV3DecoderLayer(
config,
prefix,
+ model_config=model_config,
cache_config=cache_config,
quant_config=quant_config,
),
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index dbdda51aedaa0..2b6557ad3ce66 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -262,8 +262,10 @@ def hash_block_tokens(
The hash value of the block and the token ids in the block.
The entire tuple is used as the hash key of the block.
"""
- return BlockHashType(hash((parent_block_hash, *curr_block_token_ids)),
- tuple(curr_block_token_ids), extra_keys)
+ curr_block_token_ids_tuple = tuple(curr_block_token_ids)
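+    # Hash the token ids as a tuple together with extra_keys so that blocks
+    # with identical tokens but different extra keys (e.g. multimodal inputs)
+    # do not collide.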
+ return BlockHashType(
+ hash((parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
+ curr_block_token_ids_tuple, extra_keys)
def hash_request_tokens(block_size: int,
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 08316ba74aad8..c427b759b2e97 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -110,7 +110,9 @@ def get_cache_block_size(
parallel_config, LayerBlockType.attention)
key_cache_block = cache_config.block_size * num_heads * head_size
- value_cache_block = key_cache_block
+ # For MLA there is no value cache, since the latent vector
+        # jointly encodes the keys and values.
+ value_cache_block = key_cache_block if not model_config.use_mla else 0
total = num_attention_layers * (key_cache_block + value_cache_block)
if cache_config.cache_dtype == "auto":
dtype = model_config.dtype