From 372bf0890b19cc3c2992ce5c16eca3647e2a9e13 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 27 Jan 2025 15:25:30 +0800 Subject: [PATCH 01/69] [Bugfix] Fix missing seq_start_loc in xformers prefill metadata (#12464) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/attention/backends/xformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c25dda7aad2c..49f47f9c8ded3 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -199,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: # Compute some attn_metadata fields which default to None query_start_loc = (None if self.query_start_loc is None else self.query_start_loc[:self.num_prefills + 1]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) slot_mapping = (None if self.slot_mapping is None else self.slot_mapping[:self.num_prefill_tokens]) seq_lens = (None if self.seq_lens is None else @@ -225,6 +227,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, From 624a1e4711cb9cfdd7e336980668e64744a84863 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 27 Jan 2025 01:09:27 -0800 Subject: [PATCH 02/69] [V1][Minor] Minor optimizations for update_from_output (#12454) Signed-off-by: Woosuk Kwon --- vllm/v1/core/scheduler.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 8ded5e5787133..de7fb1a698df6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -411,6 +411,10 @@ def update_from_output( num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -421,13 +425,15 @@ def update_from_output( cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + # OPTIMIZATION: Avoid list(set) if the set is empty. + if cached_encoder_input_ids: + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] From ce69f7f7542bdb8b6e6302d112fb9fad212c1460 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 27 Jan 2025 18:31:49 +0800 Subject: [PATCH 03/69] [Bugfix] Fix gpt2 GGUF inference (#12467) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/gpt2.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1656a3cc9e46d..2f1aa2d68653c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "transformer")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: - self.lm_head = self.transformer.wte - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.lm_head") + self.lm_head = self.lm_head.tie_weights(self.transformer.wte) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.startswith("lm_head"): - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. # NOTE: "c_attn.bias" should not be skipped. continue - if not name.startswith("transformer."): + if not name.startswith("transformer.") and not name.startswith( + "lm_head"): name = "transformer." + name if is_pp_missing_parameter(name, self): From 103bd17ac585b44372a47f365d80f13446cf362d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 27 Jan 2025 10:40:00 -0500 Subject: [PATCH 04/69] [Build] Only build 9.0a for scaled_mm and sparse kernels (#12339) Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 8 ++++---- cmake/utils.cmake | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ead539993d98c..4dee9ec36895f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -296,8 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). - cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
+ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") set_gencode_flags_for_srcs( @@ -351,7 +351,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). + # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15b09395a889f..1c1c539819d05 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -259,7 +259,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. # # Example: @@ -270,34 +270,47 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). 
+ foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) From 01ba927040d0b6f7d8daf6bfbf32fde562d2f8a6 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 27 Jan 2025 17:26:28 +0000 Subject: [PATCH 05/69] [V1][Metrics] Add initial Prometheus logger (#12416) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 41 ++++++++++++++++++++---- vllm/v1/engine/async_llm.py | 11 ++++--- vllm/v1/metrics/loggers.py | 36 +++++++++++++++++++++ 3 files changed, 78 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6523c8b6297c6..469a5fb039fb6 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,24 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +@pytest.fixture(scope="module", params=[True, False]) +def use_v1(request): + # Module-scoped variant of run_with_both_engines + # + # Use this fixture to run a test with both v0 and v1, and + # also to conditionalize the test logic e.g. + # + # def test_metrics_exist(use_v1, server, client): + # ... + # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS + # for metric in expected: + # assert metric in response.text + # + # @skip_v1 wouldn't work here because this is a module-level + # fixture - per-function decorators would have no effect + yield request.param + + @pytest.fixture(scope="module") def default_server_args(): return [ @@ -36,10 +54,12 @@ def default_server_args(): "--enable-chunked-prefill", "--disable-frontend-multiprocessing", ]) -def server(default_server_args, request): +def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') + with RemoteOpenAIServer(MODEL_NAME, default_server_args, + env_dict=env_dict) as remote_server: yield remote_server @@ -84,7 +104,9 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( @@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "swap_space_bytes", ] +EXPECTED_METRICS_V1 = [ + "vllm:num_requests_running", + "vllm:num_requests_waiting", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", @@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS: + for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): assert metric in response.text -def test_metrics_exist_run_batch(): +def test_metrics_exist_run_batch(use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6dc68b3a16099..917d52d3220b8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -24,7 +24,8 @@ from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -46,13 +47,15 @@ def __init__( assert start_engine_loop + self.model_config = vllm_config.model_config + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), + PrometheusStatLogger(labels=dict( + model_name=self.model_config.served_model_name)), ] - self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -272,7 +275,7 @@ async def _run_output_handler(self): # 4) Logging. # TODO(rob): make into a coroutine and launch it in - # background thread once we add Prometheus. + # background thread once Prometheus overhead is non-trivial. 
assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..b84f03fa3267c 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,8 @@ import time from abc import ABC, abstractmethod +from typing import Dict + +import prometheus_client from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -36,3 +39,36 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, labels: Dict[str, str]): + self.labels = labels + + labelnames = self.labels.keys() + labelvalues = self.labels.values() + + self._unregister_vllm_metrics() + + self.gauge_scheduler_running = prometheus_client.Gauge( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats): + """Log to prometheus.""" + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + + @staticmethod + def _unregister_vllm_metrics(): + # Unregister any existing vLLM collectors (for CI/CD + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) From 3f1fc7425a7db4d9722941075e43bb2ebfb90613 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 27 Jan 2025 09:40:04 -0800 Subject: [PATCH 06/69] [V1][CI/Test] Do basic test for top-p & top-k sampling (#12469) Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index cccfd305ac604..033bbcfce564e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -144,7 +144,7 @@ def test_engine_core(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as min_tokens and + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch): stop_token_ids=[1001, 1002], ) engine_core.add_request(request) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. - while len(engine_core.step().outputs) > 0: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 + def _check_engine_state(): + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step().outputs) > 0: + pass + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + _check_engine_state() + + # Second request. 
+ request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(request2) + _check_engine_state() From 2bc3fbba0cf5b07fabb798d41b153b895d30c7b4 Mon Sep 17 00:00:00 2001 From: Bowen Wang Date: Tue, 28 Jan 2025 02:19:24 +0800 Subject: [PATCH 07/69] [FlashInfer] Upgrade to 0.2.0 (#11194) Signed-off-by: Bowen Wang Signed-off-by: youkaichao Co-authored-by: youkaichao --- .buildkite/test-pipeline.yaml | 11 +- Dockerfile | 23 ++- .../test_basic_correctness.py | 5 +- tests/compile/test_basic_correctness.py | 2 +- tests/kernels/test_flashinfer.py | 74 +++---- vllm/attention/backends/flashinfer.py | 183 ++++++++++++++++-- vllm/config.py | 10 +- vllm/model_executor/model_loader/loader.py | 4 +- .../model_executor/model_loader/tensorizer.py | 3 +- vllm/worker/worker_base.py | 17 +- 10 files changed, 257 insertions(+), 75 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index daec46760117d..d5d02fdeb7f4b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -183,7 +183,16 @@ steps: - vllm/ - tests/v1 commands: - - VLLM_USE_V1=1 pytest -v -s v1 + # split the test to avoid interference + - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/engine + - VLLM_USE_V1=1 pytest -v -s v1/sample + - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py + - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - VLLM_USE_V1=1 pytest -v -s v1/e2e - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" diff --git a/Dockerfile b/Dockerfile index cb9cf0da5be65..0b9f74e08dc68 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +# TODO: Restore to base image after FlashInfer AOT wheel fixed +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose +# How to build this FlashInfer wheel: +# $ export FLASHINFER_ENABLE_AOT=1 +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 +# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose + RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ + python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ fi COPY examples examples + +# Although we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. 
Therefore we need to +# install build dependencies for JIT compilation. +# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements-build.txt requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-build.txt + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 31a101e48e026..23285040642a8 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -61,9 +61,10 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + if backend in ("XFORMERS", + "FLASHINFER") and model == "google/gemma-2-2b-it": pytest.skip( - "XFORMERS does not support gemma2 with full context length.") + f"{backend} does not support gemma2 with full context length.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 87d5aefea6cb4..1945479fc3031 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -58,7 +58,7 @@ class TestSetting: model_args=["--task", "embed"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index a2c8f71665737..1645ef911d697 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype) - - output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap) + + output = wrapper.run(query, key_value_cache) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward( + output = wrapper.run( query, key_value_cache, - logits_soft_cap=soft_cap, ) ref_output = ref_paged_attn(query=query, @@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_tables=block_tables, scale=scale, soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( workspace_buffer 
= torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) ref_output = ref_paged_attn(query=query, key_cache=key_cache.squeeze(1), @@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv( wrapper = flashinfer.\ BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype, - q_data_type=dtype) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 3135b0b405343..7cccef9608218 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,4 @@ +import dataclasses from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -13,9 +14,11 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 except ImportError: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None + # Avoid turning these types into variables during type checking + if not TYPE_CHECKING: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 import torch @@ -30,7 +33,9 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) +from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig, get_current_vllm_config from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) @@ -99,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") +@dataclass +class PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters. 
+ """ + + window_left: int + logits_soft_cap: Optional[float] + sm_scale: float + + +def get_per_layer_parameters( + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + """ + Scan all attention layers and determine some hyperparameters + to use during `plan`. + """ + + layers = vllm_config.compilation_config.static_forward_context + per_layer_params: Dict[str, PerLayerParameters] = {} + + for key, layer in layers.items(): + assert isinstance(layer, Attention) + + impl = layer.impl + assert isinstance(impl, FlashInferImpl) + + # Infer hyperparameters from the attention layer + window_size = impl.sliding_window + window_left = window_size[0] if window_size is not None else -1 + logits_soft_cap = impl.logits_soft_cap + sm_scale = impl.scale + + per_layer_params[key] = PerLayerParameters(window_left, + logits_soft_cap, sm_scale) + + return per_layer_params + + +def infer_global_hyperparameters( + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters: + - `window_left` + - `logits_soft_cap` + - `sm_scale` + + So this function asserts that all layers share the same values for these + hyperparameters and returns the global values. + """ + + assert len(per_layer_params) > 0, "No attention layers found in the model." + + param_sets = list(per_layer_params.values()) + global_params = param_sets[0] + for params in param_sets: + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all " + "layers share the same values for the following hyperparameters: " + "`window_left`, `logits_soft_cap`, `sm_scale`.") + + return global_params + + class FlashInferState(AttentionState): def __init__(self, runner): @@ -108,6 +179,11 @@ def __init__(self, runner): self._decode_wrapper = None self._prefill_wrapper = None + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( @@ -215,6 +291,9 @@ def graph_capture_get_metadata_for_batch( batch_size + 1, dtype=torch.int32) + global_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], @@ -238,7 +317,9 @@ def graph_capture_get_metadata_for_batch( q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None) + prefill_wrapper=None, + **dataclasses.asdict(global_params), + ) attn_metadata.begin_forward() return attn_metadata @@ -325,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata): data_type: torch.dtype = None # The data type of the query q_data_type: torch.dtype = None - device: torch.device = torch.device("cuda") + # FlashInfer 0.2 encourages passing host tensors + device: torch.device = torch.device("cpu") is_profile_run: bool = False + # The FlashInfer backend currently supports only models in which all layers + # share the same following hyperparameters: + + # The left (inclusive) window size for the attention window, when + # set to `-1`, the window size will be set to the full length of + # the sequence. Defaults to `-1`. 
+ window_left: int = -1 + # The attention logits soft capping value (used in Gemini, Grok and + # Gemma-2, etc.), if not provided, will be set to `0`. If greater + # than 0, the logits will be capped according to formula: + # $$\texttt{logits\_soft\_cap} \times + # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, + # where $x$ is the input logits. + logits_soft_cap: Optional[float] = None + # The scale used in softmax, if not provided, will be set to + # `1.0 / sqrt(head_dim)`. + sm_scale: Optional[float] = None + def __post_init__(self): # Refer to # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 @@ -363,14 +463,21 @@ def begin_forward(self): self.block_table_bound = self.block_table_bound.to(self.device) self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.end_forward() - self.prefill_wrapper.begin_forward( + self.prefill_wrapper.plan( self.query_start_loc, self.paged_kv_indptr[:self.num_prefills + 1], self.paged_kv_indices, self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, self.num_kv_heads, self.head_dim, - self.page_size) + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.data_type) if self.num_decode_tokens > 0: assert self.paged_kv_indices is not None assert self.paged_kv_indptr is not None @@ -386,8 +493,7 @@ def begin_forward(self): self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) assert self.decode_wrapper is not None - self.decode_wrapper.end_forward() - self.decode_wrapper.begin_forward( + self.decode_wrapper.plan( self.paged_kv_indptr[self.num_prefills:], self.paged_kv_indices, self.paged_kv_last_page_len[self.num_prefills:], @@ -397,8 +503,11 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, # kv-cache data type. - data_type=self.data_type, + kv_data_type=self.data_type, # query data type. 
q_data_type=self.q_data_type) @@ -496,6 +605,11 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.sliding_window = input_builder.sliding_window self.block_size = input_builder.block_size + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] @@ -528,6 +642,20 @@ def prepare(self): self.total_blocks = 0 self.is_profile_run: bool = False + if self.global_hyperparameters is None: + # Infer global hyperparameters, since currently we only support + # models in which all layers share the same values for the + # following hyperparameters: + # - `window_left` + # - `logits_soft_cap` + # - `sm_scale` + inferred_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + self.global_hyperparameters = inferred_params + self.window_left = inferred_params.window_left + self.logits_soft_cap = inferred_params.logits_soft_cap + self.sm_scale = inferred_params.sm_scale + def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -756,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], data_type=kv_cache_dtype, q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run) + is_profile_run=self.is_profile_run, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, + ) class FlashInferImpl(AttentionImpl): @@ -885,25 +1017,34 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( + + assert prefill_meta.prefill_wrapper._causal + assert prefill_meta.prefill_wrapper._window_left == window_left + assert prefill_meta.prefill_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale + + prefill_output = prefill_meta.prefill_wrapper.run( query, kv_cache, - logits_soft_cap=logits_soft_cap, - causal=True, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None assert decode_meta.decode_wrapper is not None - decode_output = decode_meta.decode_wrapper.forward( + + assert decode_meta.decode_wrapper._window_left == window_left + assert decode_meta.decode_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert decode_meta.decode_wrapper._sm_scale == softmax_scale + + decode_output = decode_meta.decode_wrapper.run( decode_query, kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if prefill_output is None and decode_output is not None: # Decode only batch. 
diff --git a/vllm/config.py b/vllm/config.py index 7a58d64bcc6e2..dc1d611115489 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -310,14 +310,15 @@ def __init__( (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " - "XFORMERS backend. Disabling sliding window and capping " + f"{backend} backend. Disabling sliding window and capping " "the max length to the sliding window size " f"({sliding_window_len_min}).") self.disable_sliding_window = True @@ -3310,7 +3311,7 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): """ Temporarily set the current VLLM config. Used during model initialization. @@ -3330,7 +3331,8 @@ def set_current_vllm_config(vllm_config: VllmConfig): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: # If the model supports compilation, # compilation_counter.num_models_seen should be increased diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e9779878710ee..527b4307f3670 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -114,7 +114,7 @@ def _initialize_model( all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -142,7 +142,7 @@ def _initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(**kwargs) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 5b4757072353f..e359aef9dcb7f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -288,7 +288,8 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? 
- with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c6e6693c54f57..6eeb4aa17051f 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -8,7 +8,8 @@ import torch import torch.nn as nn -from vllm.config import ObservabilityConfig, VllmConfig +from vllm.config import (ObservabilityConfig, VllmConfig, + set_current_vllm_config) from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -498,8 +499,11 @@ def __init__( group. """ self.rpc_rank = rpc_rank - self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None + # do not store this `vllm_config`, `init_worker` will set the final + # one. TODO: investigate if we can remove this field in + # `WorkerWrapperBase`, `init_cached_hf_modules` should be + # unnecessary now. if vllm_config.model_config is not None: # it can be None in tests trust_remote_code = vllm_config.model_config.trust_remote_code @@ -533,6 +537,9 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Arguments are passed to the worker class constructor. """ kwargs = all_kwargs[self.rpc_rank] + self.vllm_config = kwargs.get("vllm_config", None) + assert self.vllm_config is not None, ( + "vllm_config is required to initialize the worker") enable_trace_function_call_for_thread(self.vllm_config) from vllm.plugins import load_general_plugins @@ -546,8 +553,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: bytes) worker_class = cloudpickle.loads( self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(**kwargs) - assert self.worker is not None + with set_current_vllm_config(self.vllm_config): + # To make vLLM config available during worker initialization + self.worker = worker_class(**kwargs) + assert self.worker is not None def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: From 6116ca8cd79b642c64f4ae6f050a6bc12b96d037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 27 Jan 2025 22:38:35 +0100 Subject: [PATCH 08/69] [Feature] [Spec decode]: Enable MLPSpeculator/Medusa and `prompt_logprobs` with ChunkedPrefill (#10132) Signed-off-by: NickLucche Signed-off-by: wallashss Co-authored-by: wallashss --- tests/spec_decode/e2e/conftest.py | 19 +- .../e2e/test_integration_dist_tp2.py | 10 +- tests/spec_decode/e2e/test_logprobs.py | 16 +- .../e2e/test_medusa_correctness.py | 31 ++- tests/spec_decode/e2e/test_mlp_correctness.py | 53 ++++- .../e2e/test_multistep_correctness.py | 31 +-- .../spec_decode/e2e/test_ngram_correctness.py | 13 +- tests/spec_decode/test_scorer.py | 1 + tests/spec_decode/test_spec_decode_worker.py | 1 + tests/spec_decode/utils.py | 12 + vllm/config.py | 9 +- vllm/engine/llm_engine.py | 19 +- vllm/spec_decode/batch_expansion.py | 133 +++++++---- vllm/spec_decode/interfaces.py | 8 +- vllm/spec_decode/mqa_scorer.py | 68 +++++- vllm/spec_decode/spec_decode_worker.py | 211 +++++++++++++----- 16 files changed, 469 insertions(+), 166 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9cb3858c0068..5cb982a0811c7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -2,6 +2,7 @@ from typing import 
List, Optional, Sequence, Tuple, Union import pytest +import torch from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -154,6 +155,8 @@ def _check_logprobs_when_output_disabled( spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) assert spec_pos_logprob.rank == -1 assert spec_pos_logprob.logprob == 0.0 + if isinstance(spec_pos_logprob_token_id, torch.Tensor): + spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() assert spec_pos_logprob_token_id in baseline_pos_logprobs @@ -244,7 +247,8 @@ def run_equality_correctness_test_tp(model, batch_size: int, max_output_len: int, seed: int = 0, - temperature: float = 0.0): + temperature: float = 0.0, + logprobs: Optional[int] = None): """Helper method that compares the outputs of both the baseline LLM and the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the same when temperature is zero. @@ -257,7 +261,6 @@ def run_equality_correctness_test_tp(model, results = [] prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): with RemoteOpenAIServer(model, args, @@ -269,12 +272,14 @@ def run_equality_correctness_test_tp(model, prompt=prompts, max_tokens=max_output_len, seed=seed, - temperature=temperature) + temperature=temperature, + logprobs=logprobs) results.append({ "test": "seeded_sampling", "text": [choice.text for choice in completion.choices], + "logprobs": [choice.logprobs for choice in completion.choices], "finish_reason": [choice.finish_reason for choice in completion.choices], "usage": @@ -284,7 +289,15 @@ def run_equality_correctness_test_tp(model, n = len(results) // 2 arg1_results = results[:n] arg2_results = results[n:] + # Separate logprobs to avoid asserting exact equality. + arg1_logprobs = [r.pop("logprobs") for r in arg1_results] + arg2_logprobs = [r.pop("logprobs") for r in arg2_results] + for arg1_result, arg2_result in zip(arg1_results, arg2_results): assert arg1_result == arg2_result, ( f"Results for {model=} are not the same with {arg1=} and {arg2=}. " f"{arg1_result=} != {arg2_result=}") + if logprobs: + for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): + for l1, l2 in zip(logs1, logs2): + assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 02cba92795142..7001ee4c007fe 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -2,6 +2,8 @@ tensor parallelism. """ +from typing import Optional + import pytest import torch @@ -154,15 +156,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, "--speculative-draft-tensor-parallel-size", "1", ])]) +@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, + logprobs: Optional[int], batch_size: int, seed: int): """Verify spec decode works well with same and different TP size for the draft model with chunked prefill. 
""" + if logprobs: + test_llm_kwargs.extend( + ["--disable_logprobs_during_spec_decoding", "False"]) run_equality_correctness_test_tp(model, common_llm_kwargs, per_test_common_llm_kwargs, @@ -171,4 +178,5 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, batch_size, max_output_len=32, seed=seed, - temperature=0.0) + temperature=0.0, + logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 4cfca8b78e79b..1a543606cb3f3 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,26 +4,27 @@ from vllm import SamplingParams +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @pytest.mark.parametrize( "common_llm_kwargs", [{ - "model_name": "JackFram/llama-68m", + "model_name": "JackFram/llama-160m", # Skip cuda graph recording for fast test. - "enforce_eager": True, + "enforce_eager": True }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": True, }]) @@ -36,12 +37,15 @@ ]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) def test_logprobs_equality(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify output logprobs are equal with and without speculative decoding. + seed: int, logprobs: int, prefill_chunk_size: int): + """Verify output logprobs are equal with and without speculative decoding, + as well as with and without chunked prefill. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index b8965606b3d0e..dbcbc0db10881 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -21,6 +21,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -67,12 +68,14 @@ ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -119,12 +122,15 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): + seed: int, logprobs: int, + prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -167,12 +173,14 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_cuda_graph( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with cuda graph enabled and different batch sizes.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -217,13 +225,15 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality, even when some sequences are preempted mid- generation. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -267,13 +277,15 @@ def test_medusa_e2e_greedy_correctness_with_preemption( 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -313,14 +325,17 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, + prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -361,12 +376,14 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, prefill_chunk_size: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 183ff2f5db274..1fa1104f5d3a8 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -66,14 +67,16 @@ @pytest.mark.parametrize("output_len", [ 128, ]) -@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("batch_size", [4, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -116,12 +119,19 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, seed: int, - logprobs: int): + logprobs: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + # NOTE Test is sensitive enough st if we don't enable chunked prefill + # scheduling on baseline too, we get slightly different logprobs, ending + # up sampling different tokens at the tail (ie top tokens don't change). + # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? 
+ maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -162,12 +172,15 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): + batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): """Verify acceptance rate with different batch size and large output length.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -204,13 +217,17 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("temperature", [1.0]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - temperature: float, seed: int): + temperature: float, + prefill_chunk_size: int, seed: int): """Verify seeded runs produce the same output.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -266,14 +283,16 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, 128, ]) @pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality, even when some sequences are preempted mid- generation. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -317,12 +336,14 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_correctness_with_padding( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality when the vocab dimension is padded """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) # Default pad_to is 64, test model has vocab_size of 32000 def patched_pad_vocab_size(vocab_size, pad_to=None): @@ -373,14 +394,16 @@ def patched_pad_vocab_size(vocab_size, pad_to=None): # Use smaller output len for fast test. 
32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, - output_len: int): + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -418,15 +441,21 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +# Speculative decoding is disabled when sequences reach decoding and the batch +# consists of single-token requests. Hence we set `max_num_seqs` +# >= `speculative_disable_by_batch_size` to test feature interaction. +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -460,13 +489,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, prefill_chunk_size: int, seed: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index a13cca41f99e5..05ad468dd8bc5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -147,20 +147,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": False, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + "disable_logprobs_during_spec_decoding": False + }, { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "disable_logprobs_during_spec_decoding": False + }]) @pytest.mark.parametrize( "output_len", [ @@ -192,6 +192,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( batch_size, max_output_len=output_len, seed=seed, + prompt_logprobs=2, + logprobs=2, + disable_logprobs=False, temperature=0.0, ensure_all_accepted=ensure_all_accepted) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index e53d169a8fcc3..77f8b8998c8d3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,6 +26,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @@ -49,11 +50,13 @@ "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": False, }, { "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": True, }, ]) @pytest.mark.parametrize("output_len", [ @@ -68,15 +71,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, batch_size: int, output_len: int, prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" - if prefill_chunk_size > 0: - common_llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - common_llm_kwargs["enable_chunked_prefill"] = False + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 0b1509d8b7785..5a093dea16d40 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -60,6 +60,7 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, num_gpu_blocks = 2048 // block_size scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) + scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer 
scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True scorer_worker.model_runner.model.sampler.\ should_modify_greedy_probs_inplace = True diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index caf7a7e625b46..d8c3af4c1cd1e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -754,6 +754,7 @@ def test_populate_seq_ids_with_bonus_tokens(): seq_group_metadata_list=seq_group_metadata_list, accepted_token_ids=accepted_token_ids, target_logprobs=target_token_logprobs, + prompt_logprobs=None, k=k, stage_times=(0, 0, 0)) # Verify that _seq_with_bonus_token_in_last_step contains the following: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index a4bfa6b2f384b..2f883c2ff9b7a 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -274,3 +274,15 @@ def create_batch(batch_size, prompts, num_gpu_blocks, block_size, final_prompt_lens, prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens + + +def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): + if prefill_chunk_size > 0: + llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + llm_kwargs["enable_chunked_prefill"] = False diff --git a/vllm/config.py b/vllm/config.py index dc1d611115489..7ab632d7e3667 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1685,7 +1685,8 @@ def maybe_create_spec_config( raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - + if (enable_chunked_prefill and speculative_model == "eagle"): + raise ValueError("Chunked prefill and EAGLE are not compatible.") # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. draft_revision = None @@ -1752,12 +1753,6 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") - if enable_chunked_prefill and draft_hf_config.model_type in ( - "medusa", "mlp_speculator", "eagle"): - raise ValueError( - "Chunked prefill and hidden-state based draft models are " - "not compatible.") - speculative_draft_tensor_parallel_size = \ SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( target_parallel_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7da18d5f7d2eb..ab67ae29723cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1010,8 +1010,23 @@ def _process_model_outputs(self, self.speculative_config # Organize outputs by [step][sequence group] instead of # [sequence group][step]. - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, num_seq_groups=len(seq_group_metadata_list)) + if self.scheduler_config.is_multi_step: + outputs_by_sequence_group = create_output_by_sequence_group( + outputs, len(seq_group_metadata_list)) + elif self.speculative_config: + # Decodes are multi-steps while prefills are not, outputting at + # most 1 token. Separate them so that we can trigger chunk + # processing without having to pad or copy over prompts K times + # to match decodes structure (costly with prompt_logprobs). 
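# [Editor's note - illustrative sketch, not part of the patch] Toy version of
# the regrouping described in the comment above and implemented in the lines
# that follow, using plain lists in place of SamplerOutput objects. The
# `regroup` helper and the example values below are hypothetical.

def regroup(outputs, num_prefills, num_decode_groups):
    prefills, decode_steps = outputs[:num_prefills], outputs[num_prefills:]
    # decode_steps[step][group] -> one list per decode group, ordered by step.
    decodes_by_group = [[step[g] for step in decode_steps]
                        for g in range(num_decode_groups)]
    # Each prefill already holds the single output of one sequence group.
    return [[p] for p in prefills] + decodes_by_group

# 2 prefills, 2 decode groups, 3 speculative steps:
outs = ["p0", "p1",
        ["d0_s0", "d1_s0"], ["d0_s1", "d1_s1"], ["d0_s2", "d1_s2"]]
assert regroup(outs, 2, 2) == [["p0"], ["p1"],
                               ["d0_s0", "d0_s1", "d0_s2"],
                               ["d1_s0", "d1_s1", "d1_s2"]]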
+ num_prefills = sum(sg.is_prompt + for sg in seq_group_metadata_list) + prefills, decodes = outputs[:num_prefills], outputs[ + num_prefills:] + outputs_by_sequence_group = create_output_by_sequence_group( + decodes, + num_seq_groups=len(seq_group_metadata_list) - num_prefills) + outputs_by_sequence_group = [p.outputs for p in prefills + ] + outputs_by_sequence_group # We have outputs for multiple steps submitted in a single burst, # so invalidate is_first_step_output. is_first_step_output = None diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 01b9cdad963da..56fb9ba506a44 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -83,13 +83,13 @@ def score_proposals( if not non_spec_indices: # All sequence groups in batch have spec decoding enabled - contracted = self._contract_batch_all_spec( + return self._contract_batch_all_spec( target_sampler_output=target_sampler_output, proposals=proposals, ) else: # Batch has a mix of spec decode enabled and disabled seq groups - contracted = self._contract_batch( + return self._contract_batch( execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, @@ -99,14 +99,6 @@ def score_proposals( k=execute_model_req.num_lookahead_slots, ) - all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted - return SpeculativeScores( - probs=all_probs, - token_ids=all_tokens, - logprobs=spec_logprobs, - hidden_states=all_hidden_states, - ) - def _expand_batch( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -143,13 +135,57 @@ def _expand_batch( return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) + def _contract_non_speculative( + self, scores: SpeculativeScores, + seq_group_metadata_list: List[SequenceGroupMetadata], + non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, + has_prompt_log: bool) -> SpeculativeScores: + """ + Augment input `scores` with non-speculative requests outputs. + This includes decode requests with speculation turned off, as well + as prefill requests when `enable_chunked_prefill` is set. + For the latter, prefills are further separated into terminal and + non-terminal chunks (from which no token is sampled). + """ + if not non_spec_indices: + return scores + + if has_prompt_log: + # When prompt_logprobs is enabled, prefills yield output token + # (and respective prob) in the last entry (prompt|out): + # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. + # With chunked prefill, non-terminal chunks have -1 on each + # position: they're still picked, but they're discarded later. + seq_meta = seq_group_metadata_list + nospec_sizes = torch.tensor([ + seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 + for i in non_spec_indices + ]) + nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) + else: + # In this case only sampled tokens are returned, select all. 
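# [Editor's note - illustrative sketch, not part of the patch] How the
# cumulative-sum indexing in the `has_prompt_log` branch just above locates
# each non-speculative request's sampled token. Hypothetical batch: two
# prefill chunks of 4 and 2 prompt tokens plus one decode, so the target
# output is laid out as [.|.|.|out0, .|out1, out2]:
import torch

nospec_sizes = torch.tensor([4, 2, 1])
nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1)
assert nospec_sampled_token_idxs.tolist() == [3, 5, 6]
# The else-branch continuing below handles the case without prompt logprobs,
# where only sampled tokens are returned and every position is selected.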
+ nospec_sampled_token_idxs = list( + range(len(non_spec_outputs.token_ids))) + + scores.token_ids[non_spec_indices, :1] = \ + non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) + scores.probs[non_spec_indices, :1, :] = \ + non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) + scores.logprobs[non_spec_indices, :1, :] = \ + non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) + if scores.hidden_states is not None: + assert non_spec_outputs.hidden_states is not None + scores.hidden_states[non_spec_indices, :1, :] = \ + non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) + return scores + def _contract_batch( - self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - num_scoring_tokens: int, non_spec_indices: List[int], - spec_indices: List[int], k: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + self, + contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, + proposals: SpeculativeProposals, num_scoring_tokens: int, + non_spec_indices: List[int], spec_indices: List[int], + k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -195,23 +231,28 @@ def _contract_batch( else: all_hidden_states = None - # Rule out prefills that produce no tokens. - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - if len(non_spec_indices): - all_tokens[non_spec_indices, :1] = \ - non_spec_target_token_ids.unsqueeze(1) - all_probs[non_spec_indices, :1, :] = \ - non_spec_target_probs.unsqueeze(1) - all_logprobs[non_spec_indices, :1, :] = \ - non_spec_target_logprobs.unsqueeze(1) - if all_hidden_states is not None: - assert non_spec_target_hidden_states is not None - all_hidden_states[non_spec_indices, :1, :] = \ - non_spec_target_hidden_states.unsqueeze(1) - + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in contracted_seq_group_metadata_list) + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is. + prompt_logprobs = None + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + elif not has_prompt_log: + # When prompt logprobs are not to be returned, + # we can ignore non-terminal chunks (no out token). + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + + # "Contract" speculative. 
if spec_indices: all_tokens[spec_indices] = target_token_ids all_probs[spec_indices] = target_probs @@ -219,14 +260,27 @@ def _contract_batch( if all_hidden_states is not None: all_hidden_states[spec_indices] = target_hidden_states - return all_tokens, all_probs, all_logprobs, all_hidden_states + spec_scores = SpeculativeScores(probs=all_probs, + token_ids=all_tokens, + logprobs=all_logprobs, + hidden_states=all_hidden_states, + prompt_logprobs=prompt_logprobs) + + non_spec_outputs = SpeculativeScores( + probs=non_spec_target_probs, + token_ids=non_spec_target_token_ids, + logprobs=non_spec_target_logprobs, + hidden_states=non_spec_target_hidden_states) + # Contract remaining nonspec entries based on non_spec_indices, if any. + return self._contract_non_speculative( + spec_scores, contracted_seq_group_metadata_list, non_spec_indices, + non_spec_outputs, has_prompt_log) def _contract_batch_all_spec( self, target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + ) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -250,8 +304,11 @@ def _contract_batch_all_spec( target_hidden_states = target_hidden_states.reshape( *target_token_ids.shape, target_hidden_states.shape[-1]) - return (target_token_ids, target_probs, target_logprobs, - target_hidden_states) + return SpeculativeScores(probs=target_probs, + token_ids=target_token_ids, + logprobs=target_logprobs, + hidden_states=target_hidden_states, + prompt_logprobs=None) def _create_scoring_model_input( self, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index a4fe0f13c8db1..c39e98b6cca12 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set, Union +from typing import List, Optional, Set, Union import torch -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, PromptLogprobs from vllm.worker.worker_base import WorkerBase @@ -54,6 +54,10 @@ class SpeculativeScores: # Optional last hidden states from the scoring model. hidden_states: Optional[torch.Tensor] = None + # Scoring model may also return logprobs for prompt tokens + # for each request, when chunked prefill is enabled. + prompt_logprobs: Optional[List[PromptLogprobs]] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index cbf793e2043e3..3aea2eabb4144 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -72,9 +72,15 @@ def score_proposals( target_token_ids = target_sampler_output.sampled_token_ids target_probs = target_sampler_output.sampled_token_probs target_logprobs = target_sampler_output.logprobs + prompt_logprobs = None + # If all requests have the same number of query tokens, we can avoid # the for loop to build output for better performance. if min(all_proposal_lengths) == k: + # Regular decodes only. 
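# [Editor's note - illustrative sketch, not part of the patch] The fast path
# guarded just above (`min(all_proposal_lengths) == k`) can reshape the flat
# sampler output directly, since with no prefills every request contributes
# exactly k + 1 scored positions. Sizes here are hypothetical:
import torch

bs, k, vocab = 2, 3, 11
target_token_ids = torch.arange(bs * (k + 1))
target_probs = torch.rand(bs * (k + 1), vocab)
assert target_token_ids.reshape(bs, k + 1).shape == (bs, k + 1)
assert target_probs.reshape(bs, k + 1, vocab).shape == (bs, k + 1, vocab)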
+ assert all(not sg.is_prompt + for sg in target_seq_group_metadata_list + if sg.is_prompt) bs, _ = proposals.proposal_token_ids.shape all_tokens = target_token_ids.reshape(bs, k + 1) all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) @@ -88,19 +94,56 @@ def score_proposals( all_logprobs = target_logprobs.new_full(size=all_probs.shape, fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() - start_loc = 0 - for i, (proposed_len, seq_meta) in enumerate( - zip(all_proposal_lengths, target_seq_group_metadata_list)): + + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is, since it may be + # that n_prompts >> K. + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in target_seq_group_metadata_list) + # TODO (NickLucche) we should surface `disable_logprobs` as to not + # break abstraction to get its value. + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + + # Split loop into prefill|decode for readability. + start_loc, i = 0, 0 + while i < len(target_seq_group_metadata_list + ) and target_seq_group_metadata_list[i].is_prompt: + seq_meta = target_seq_group_metadata_list[i] + end_loc = start_loc + if has_prompt_log: + end_loc += seq_meta.token_chunk_size + elif seq_meta.do_sample: + end_loc += 1 + # Skip chunks with no output tokens. if seq_meta.do_sample: - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + # Get sampled token (last position in chunk) and its prob. + all_tokens[i, 0] = target_token_ids[end_loc - 1] + all_probs[i, 0] = target_probs[end_loc - 1] + all_logprobs[i, 0] = target_logprobs[end_loc - 1] + + i += 1 + start_loc = end_loc + # Decodes. 
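# [Editor's note - illustrative sketch, not part of the patch] Offset walking
# performed by the prefill loop above and the decode loop that follows, with
# prompt logprobs enabled and hypothetical sizes: three prefill chunks of
# 3, 2 and 4 prompt tokens, then two decodes with proposal length k = 2. Each
# prefill spans its whole chunk (a terminal chunk's sampled token is read at
# end_loc - 1); each decode spans proposed_len + 1 scored positions.
chunk_sizes = [3, 2, 4]
k = 2
spans, start_loc = [], 0
for size in chunk_sizes:                 # prefill chunks
    end_loc = start_loc + size
    spans.append((start_loc, end_loc))
    start_loc = end_loc
for _ in range(2):                       # decodes
    end_loc = start_loc + (k + 1)
    spans.append((start_loc, end_loc))
    start_loc = end_loc
assert spans == [(0, 3), (3, 5), (5, 9), (9, 12), (12, 15)]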
+ while i < len(target_seq_group_metadata_list): + proposed_len, seq_meta = all_proposal_lengths[ + i], target_seq_group_metadata_list[i] + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc + i += 1 hidden_states = None if target_sampler_output.hidden_states is not None: @@ -110,4 +153,5 @@ def score_proposals( return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, - hidden_states=hidden_states) + hidden_states=hidden_states, + prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 0d66ede3d907a..8e9802c7d333c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -563,50 +563,57 @@ def _serialize_sampler_output_no_logprobs( (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - if sg.do_sample # ignore empty token sequences ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with # their own empty output. - for seq_group_meta in execute_model_req.seq_group_metadata_list: - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput(samples=[], - prompt_logprobs=None)) - else: - # Sequence with output. - seq_id, seq_data = seq_data_entries[output_index] - needs_prompt_logprobs = seq_output_prompt_logprobs[ - output_index] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], + for idx, seq_group_meta in enumerate( + execute_model_req.seq_group_metadata_list): + needs_prompt_logprobs = seq_output_prompt_logprobs[idx] + seq_id, seq_data = seq_data_entries[idx] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + + # Some of these sequences may belong to non-terminal chunks, + # which may still have to report logprobs for prompts. + start = 1 if seq_data._num_computed_tokens == 0 \ + else seq_data._num_computed_tokens + end = (seq_data._num_computed_tokens + \ + seq_group_meta.token_chunk_size) + prompt_token_ids = prompt_token_ids[start:end] + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, token_id_logprob_rank=-1, token_id_logprob=0.0, - seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 + ) for p_token_id in prompt_token_ids + ] + else: + prompt_logprobs = None + + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. 
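# [Editor's note - illustrative sketch, not part of the patch] The chunk-wise
# prompt slicing above, for a hypothetical 6-token prompt processed in chunks
# of 3. The first prompt token never gets a logprob, so the first chunk starts
# at 1; later chunks resume at the number of already-computed tokens.
prompt_token_ids = [11, 22, 33, 44, 55, 66]

def chunk_slice(num_computed_tokens, token_chunk_size):
    start = 1 if num_computed_tokens == 0 else num_computed_tokens
    end = num_computed_tokens + token_chunk_size
    return prompt_token_ids[start:end]

assert chunk_slice(0, 3) == [22, 33]        # first (non-terminal) chunk
assert chunk_slice(3, 3) == [44, 55, 66]    # second, terminal chunk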
+ if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput( + samples=[], prompt_logprobs=prompt_logprobs)) + continue + + # Sequence with output. + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], + token_id_logprob_rank=-1, + token_id_logprob=0.0, + seq_id=seq_id, + topk_token_ids=[], + topk_logprobs=[], + prompt_logprobs=prompt_logprobs)) + output_index += 1 return [SamplerOutput(outputs=completion_seq_group_output_list)] @@ -624,24 +631,27 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] - # Store hidden states from target model execution. + # Store hidden states from target model execution, BxD. hidden_states = sampler_output.hidden_states if hidden_states is not None: - # remove hidden_states for prompt tokens - # TODO Enable `return_hidden_states`: prefill chunks hidden states - # are pruned by the logits processor. Also, they should be arranged - # back into full-prefill latent. Address it to enable MLPSpeculator. - if any(seq.is_prompt - for seq in execute_model_req.seq_group_metadata_list): + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) hidden_states = hidden_states[ torch.where(sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None: + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): self.previous_hidden_states = HiddenStates( - hidden_states, execute_model_req.seq_group_metadata_list) - else: - self.previous_hidden_states.update( - hidden_states, execute_model_req.seq_group_metadata_list) + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) if not skip_proposer: # We prepare the prefill hidden states here so that there no @@ -752,13 +762,13 @@ def _run_speculative_decoding_step( ] if len(non_spec_indices): all_hidden_states = proposal_scores.hidden_states - # TODO fix `return_hidden_states`, same as in `_run_no_spec` if all_hidden_states is not None: prefill_hidden_states = all_hidden_states[non_spec_indices] execute_model_req.previous_hidden_states = \ prepare_prefill_hidden_states(prefill_hidden_states) # Sync proposer KV cache for prefills. prefill_req = execute_model_req.clone(non_spec_seqs) + # TODO avoid sampling here? self.proposer_worker.execute_model(prefill_req) with Timer() as verification_timer: @@ -774,6 +784,8 @@ def _run_speculative_decoding_step( execute_model_req.seq_group_metadata_list, accepted_token_ids, target_logprobs=target_logprobs, + prompt_logprobs=proposal_scores.prompt_logprobs + if not self._disable_logprobs else None, k=execute_model_req.num_lookahead_slots, stage_times=stage_times) @@ -845,19 +857,32 @@ def _verify_tokens( # metadata. 
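# [Editor's note - illustrative sketch, not part of the patch] The hidden-state
# filtering in `_run_no_spec` above keeps only rows whose sampled token id is
# valid, dropping non-terminal prefill chunks. Shapes here are hypothetical:
import torch

INVALID_TOKEN_ID = -1                    # stands in for VLLM_INVALID_TOKEN_ID
hidden_states = torch.randn(3, 8)        # one row per sequence, hidden size 8
sampled_token_ids = torch.tensor([[12], [INVALID_TOKEN_ID], [7]])
keep = torch.where(sampled_token_ids - INVALID_TOKEN_ID)[0]
assert keep.tolist() == [0, 2]
assert hidden_states[keep].shape == (2, 8)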
accepted_token_ids[original_indices] = accepted_token_ids.clone() + # B x K+1 x D hidden_states = proposal_scores.hidden_states if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + # Contract hidden states based on accepted tokens hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) - index = accepted_index[:, None, None].expand(-1, 1, hs_size) + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. + hidden_states = hidden_states[ + accepted_index != VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[ + accepted_index != VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( + terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, + hs_size) # b x 1 x d second_last_token_hidden_states = hidden_states[:, -2] # b x d hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_metadata_list, + hidden_states, terminal_metadata, second_last_token_hidden_states) return accepted_token_ids, logprobs @@ -866,6 +891,8 @@ def _create_output_sampler_list( seq_group_metadata_list: List[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] + prompt_logprobs: Optional[ + torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, stage_times: Tuple[float, float, float], ) -> List[SamplerOutput]: @@ -909,15 +936,89 @@ def _create_output_sampler_list( # Construct the output on a per-step, per-sequence basis. # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while + # terminal chunks will only have one generated token at time 0. sampler_output_list: List[SamplerOutput] = [] + + # Prefills are not multi-step (return at most 1 token), in order to + # avoid padding or repetition to fit decodes, we separate them. + for i, sg in enumerate(seq_group_metadata_list): + if not sg.is_prompt: + # Requests are ordered as prefills|decodes=>no more prefills. + break + num_logprobs = num_logprobs_per_seq[i] + seq_kwargs = dict(token_id=-1, + token_id_logprob_rank=0, + token_id_logprob=-float('inf'), + topk_token_ids=[-1] * num_logprobs, + topk_logprobs=[-float('inf')] * num_logprobs, + seq_id=seq_ids[i]) + # Terminal chunk, has token. + if sg.do_sample: + seq_kwargs.update( + dict( + token_id=accepted_token_ids[i][0].item(), + token_id_logprob_rank=accepted_token_id_ranks_by_step[ + 0][i], + token_id_logprob=accepted_token_id_logprobs_by_step[0] + [i], + topk_token_ids=topk_indices_by_step[0][i] + [:num_logprobs], + # output only so step is 0 + topk_logprobs=topk_logprobs_by_step[0][i] + [:num_logprobs], + )) + needs_plogs = (sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + plogs = None + if prompt_logprobs is not None: + # Even non-terminal prompt chunks can have logprobs here. + plogs = prompt_logprobs[i] + elif needs_plogs: + # Prompt logprobs are requested but `_disable_logprobs` is set. 
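# [Editor's note - illustrative sketch, not part of the patch] How
# `_verify_tokens` above turns accepted token ids into the index of the last
# accepted position, and why all -1 rows (non-terminal prefill chunks) are
# dropped before gathering hidden states. Hypothetical batch of 3 with
# k + 1 = 4:
import torch

INVALID_TOKEN_ID = -1                    # stands in for VLLM_INVALID_TOKEN_ID
accepted_token_ids = torch.tensor([
    [501, 502, -1, -1],                  # decode: 2 accepted -> index 1
    [777, -1, -1, -1],                   # decode: 1 accepted -> index 0
    [-1, -1, -1, -1],                    # non-terminal prefill chunk -> -1
])
accepted_index = (accepted_token_ids + 1).count_nonzero(dim=1).add_(-1)
assert accepted_index.tolist() == [1, 0, -1]
assert (accepted_index != INVALID_TOKEN_ID).tolist() == [True, True, False]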
+ seq_data = next(iter(sg.seq_data.values())) + # Get only the tokens in this chunk! + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_token_ids = prompt_token_ids[ + seq_data. + _num_computed_tokens:seq_data._num_computed_tokens + + sg.token_chunk_size] + + is_first_chunk = seq_data._num_computed_tokens == 0 + # There's no prob generated for the first token in a sequence. + if is_first_chunk: + prompt_token_ids = prompt_token_ids[1:] + plogs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) for p_token_id in prompt_token_ids + ] + seq_kwargs.update(dict(prompt_logprobs=plogs)) + + sampler_output_list.append( + SamplerOutput( + outputs=[create_sequence_group_output( + **seq_kwargs)])) # type: ignore + + # Decodes, create one SamplerOutput per-step (at most K+1). for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): + if all(token_id == -1 for sg, token_id in zip( + seq_group_metadata_list, + accepted_token_ids_by_step[step_index]) + if not sg.is_prompt): break step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): + seq_meta = seq_group_metadata_list[sequence_index] + # Prompts already processed above. + if seq_meta.is_prompt: + continue + # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] step_output_token_ids.append( @@ -952,6 +1053,8 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. self._maybe_log_stage_times(*stage_times) + # First `n_prefills` entries will contain prefills SamplerOutput when + # chunked prefill is enabled, the rest is decodes in multi-step format. 
return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, From 823ab796330825f4052d771e2c462ad3b55236eb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:23:08 +0000 Subject: [PATCH 09/69] Update `pre-commit` hooks (#12475) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +- benchmarks/benchmark_serving.py | 4 +- csrc/custom_all_reduce.cuh | 8 +- csrc/moe/marlin_kernels/marlin_moe_kernel.h | 8 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 16 +-- .../marlin/dense/marlin_cuda_kernel.cu | 4 +- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 4 +- csrc/quantization/marlin/sparse/common/mma.h | 4 +- csrc/rocm/attention.cu | 4 +- setup.py | 2 +- tests/kernels/test_block_fp8.py | 25 ++-- tests/kv_transfer/test_lookup_buffer.py | 10 +- tests/lora/test_qwen2vl.py | 6 +- .../vision_language/test_models.py | 130 ++++++++++-------- .../vision_language/test_pixtral.py | 17 ++- tests/quantization/test_compressed_tensors.py | 6 +- tests/samplers/test_rejection_sampler.py | 15 +- tools/report_build_time_ninja.py | 5 +- vllm/_custom_ops.py | 4 +- vllm/attention/ops/prefix_prefill.py | 28 ++-- vllm/attention/ops/triton_flash_attention.py | 4 +- vllm/attention/selector.py | 4 +- vllm/config.py | 7 +- vllm/core/block/common.py | 7 +- vllm/core/block_manager.py | 4 +- vllm/core/scheduler.py | 23 ++-- .../device_communicators/shm_broadcast.py | 8 +- vllm/distributed/parallel_state.py | 8 +- vllm/entrypoints/chat_utils.py | 4 +- vllm/entrypoints/openai/serving_completion.py | 9 +- .../granite_20b_fc_tool_parser.py | 4 +- vllm/lora/layers.py | 12 +- vllm/lora/models.py | 5 +- vllm/lora/ops/triton_ops/sgmv_expand.py | 5 +- vllm/lora/ops/triton_ops/sgmv_shrink.py | 4 +- .../kernels/mixed_precision/MPLinearKernel.py | 12 +- .../kernels/scaled_mm/ScaledMMLinearKernel.py | 14 +- .../layers/quantization/utils/fp8_utils.py | 7 +- .../layers/quantization/utils/w8a8_utils.py | 4 +- vllm/model_executor/layers/sampler.py | 7 +- .../layers/vocab_parallel_embedding.py | 16 +-- vllm/model_executor/model_loader/loader.py | 5 +- .../model_executor/model_loader/tensorizer.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/model_executor/models/granitemoe.py | 6 +- vllm/model_executor/models/mllama.py | 4 +- vllm/model_executor/models/mlp_speculator.py | 4 +- vllm/model_executor/models/phimoe.py | 8 +- vllm/model_executor/models/registry.py | 3 +- vllm/model_executor/models/ultravox.py | 8 +- vllm/model_executor/models/utils.py | 5 +- vllm/model_executor/sampling_metadata.py | 11 +- vllm/platforms/neuron.py | 4 +- vllm/scalar_type.py | 4 +- vllm/spec_decode/spec_decode_worker.py | 4 +- vllm/spec_decode/top1_proposer.py | 10 +- vllm/spec_decode/util.py | 12 +- vllm/transformers_utils/configs/nemotron.py | 4 +- vllm/utils.py | 10 +- vllm/v1/core/scheduler.py | 4 +- vllm/v1/stats/common.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/hpu_worker.py | 8 +- vllm/worker/tpu_model_runner.py | 4 +- 64 files changed, 322 insertions(+), 288 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 432bf5ed18dbc..7b32df90bfd8b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,18 +3,18 @@ default_stages: - manual # Run in CI repos: - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.43.0 hooks: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - 
repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.5 + rev: v0.9.3 hooks: - id: ruff args: [--output-format, github] - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.0 hooks: - id: codespell exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' @@ -23,7 +23,7 @@ repos: hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.5 + rev: v19.1.7 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' @@ -35,7 +35,7 @@ repos: - id: pymarkdown files: docs/.* - repo: https://github.com/rhysd/actionlint - rev: v1.7.6 + rev: v1.7.7 hooks: - id: actionlint - repo: local diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 63d2c3f7c7dd9..8b3212831e7e0 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -926,8 +926,8 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2b2eb8..b9df4ed160b03 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b3d7c2..47ecf109d0f53 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 04ef842fbdf95..7c33fea93d6ae 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index c03fef886e4db..4db8f5dcdabf6 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index 103a6444f3a21..048a3f736fb71 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index b26505f771c8b..49eee4128ee7c 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 9477790629c9f..ffa9d44610a7f 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] - const int max_num_partitions){UNREACHABLE_CODE} + const int max_num_partitions) { + UNREACHABLE_CODE +} #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support diff --git a/setup.py b/setup.py index ee193e4693806..59ece870b5585 100755 --- a/setup.py +++ b/setup.py @@ -417,7 +417,7 @@ def get_rocm_version(): if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)) == 0): - return "%d.%d.%d" % (major.value, minor.value, patch.value) + return f"{major.value}.{minor.value}.{patch.value}" return None except Exception: return None diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index a16cc4582a180..f28fdf3feedbc 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) ] B_tiles = [[ - B[j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) ] for j in range(n_tiles)] C_tiles = [ C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) @@ -157,9 +159,9 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, - SEEDS)) +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): torch.manual_seed(seed) @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): assert torch.allclose(scale, ref_scale) -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, - SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 
-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, - BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.manual_seed(seed) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 718730bb8cbbe..4d6890305af73 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, device: %s" % (my_rank, device)) + print(f"My rank: {my_rank}, device: {device}") # insert tokens = torch.tensor([1, 2, 3]).to(device) @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, Test run passed!" % (my_rank)) + print(f"My rank: {my_rank}, Test run passed!") def stress_test(my_rank, buf, device): @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(k, k_) assert torch.allclose(v, v_) assert torch.allclose(h, h_) - print('Rank %d done' % my_rank) + print(f"Rank {my_rank} done") torch.distributed.barrier() if my_rank == 0: @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("My rank: %d, Passed stress test!" % (my_rank)) + print(f"My rank: {my_rank}, Passed stress test!") if __name__ == "__main__": @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device): rank=my_rank, ) - print("initialized! My rank is %d" % my_rank) + print(f"initialized! 
My rank is {my_rank}") config = KVTransferConfig( kv_connector='PyNcclConnector', diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index ebdd129db5f6a..570aa3861d0be 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm" - ) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") def test_qwen2vl_lora(qwen2vl_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 14d9a739be318..d5f0d63288cc1 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -521,12 +521,13 @@ def _mark_splits( # - image embeddings # - video # - custom inputs -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=False, + )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=False, + )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=False, + )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=False, + )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], video_assets: _VideoAssets): @@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=False, - )) 
+@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=False, + )) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -627,12 +632,13 @@ def test_custom_inputs_models( #### Tests filtering for things running each test as a new process -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, @@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=True, + )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab99054c..8103e5305b91b 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs( outputs: OutputsLogprobs, 
filename: "StrPath", ) -> None: - json_data = [(tokens, text, - [{k: asdict(v) - for k, v in token_logprobs.items()} - for token_logprobs in (logprobs or [])]) + json_data = [(tokens, text, [{ + k: asdict(v) + for k, v in token_logprobs.items() + } for token_logprobs in (logprobs or [])]) for tokens, text, logprobs in outputs] with open(filename, "w") as f: @@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, - [{int(k): Logprob(**v) - for k, v in token_logprobs.items()} - for token_logprobs in logprobs]) - for tokens, text, logprobs in json_data] + return [(tokens, text, [{ + int(k): Logprob(**v) + for k, v in token_logprobs.items() + } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] @large_gpu_test(min_gb=80) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index bf0d454ad511c..1072697ecf5cc 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -314,9 +314,9 @@ def check_model(model): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="2of4 Sparse is not yet supported on this GPU type." - ) +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="2of4 Sparse is not yet supported on this GPU type.") @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 397fa2cc85821..dcb1b27bff37f 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( """ batch_size = last_accepted_indices.shape[0] - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( + accepted = (torch.arange(k).expand(batch_size, k) + <= last_accepted_indices.unsqueeze(-1).broadcast_to( batch_size, k)) # Sprinkle accepted values after the contiguous initial accepted values. # This replicates the behavior of rejection sampling, which may "accept" # a token that cannot be accepted because of causality. 
- sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) + sprinkle_candidates = (torch.arange(k).expand( + batch_size, + k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + + 1) sprinkle = torch.rand(batch_size, k) > 0.5 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] return accepted @@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( distance_wrt_reference) expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * expected_improvement_multiplier) diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 51ad2adc74fe1..9dc19f5fd4cdd 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' 'parallelism)'.format(length, total_cpu_time, total_cpu_time * 1.0 / length)) - print(' %d build steps completed, average of %1.2f/s' % - (len(entries), len(entries) / (length))) + print(' {} build steps completed, average of {:1.2f}/s'.format( + len(entries), + len(entries) / (length))) def main(): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 440bc52012ab7..85c1121ed6ff8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -820,8 +820,8 @@ def scaled_int8_quant( if scale is not None: # static-per-tensor quantization. assert symmetric == ( - azp is - None), "azp must only be provided for asymmetric quantization." + azp + is None), "azp must only be provided for asymmetric quantization." 
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) return output, scale, azp diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e2f2b66dfc90c..ec3c8459c43ef 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -219,8 +219,8 @@ def _fwd_kernel( float("-inf")) if SLIDING_WINDOW > 0: qk = tl.where( - offs_m[:, None] - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) + offs_m[:, None] - (start_n + offs_n[None, :]) + < SLIDING_WINDOW, qk, -10000) # -- compute m_ij, p, l_ij m_ij = tl.max(qk, 1) @@ -324,10 +324,10 @@ def _fwd_kernel_flash_attn_v2( (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) + q = tl.load(Q + off_q, + mask=offs_m[:, None] + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) # # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") @@ -402,8 +402,8 @@ def _fwd_kernel_flash_attn_v2( # -- compute qk ---- k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -430,8 +430,8 @@ def _fwd_kernel_flash_attn_v2( # update acc v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) p = p.to(v.dtype) @@ -639,8 +639,8 @@ def _fwd_kernel_alibi( k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -677,8 +677,8 @@ def _fwd_kernel_alibi( v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) p = p.to(v.dtype) diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index f94211116a746..ef04603f22b6e 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -627,8 +627,8 @@ def attn_fwd( causal_start_idx, dtype=tl.int32) mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] >= - out_mask_boundary[None, :]) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 81ea6eefb5410..1376274d57777 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,6 +1,6 @@ import os from contextlib import contextmanager -from functools import lru_cache +from functools import cache from typing import Generator, Optional, Type import torch @@ -100,7 +100,7 @@ def get_attn_backend( ) -@lru_cache(maxsize=None) +@cache def _cached_get_attn_backend( head_size: int, dtype: 
torch.dtype, diff --git a/vllm/config.py b/vllm/config.py index 7ab632d7e3667..d7c9311ae3cb0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -67,7 +67,8 @@ _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { task: runner - for runner, tasks in _RUNNER_TASKS.items() for task in tasks + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks } HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], @@ -1976,8 +1977,8 @@ def _verify_args(self) -> None: "typical_acceptance_sampler.") if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index c03b5932eafb6..115f663e4ad34 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, - RefCount] = {index: 0 - for index in deduped} + self._refcounts: Dict[BlockId, RefCount] = { + index: 0 + for index in deduped + } def incr(self, block_id: BlockId) -> RefCount: assert block_id in self._refcounts diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 62a5f0bda061a..2d6a132ed555b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -136,8 +136,8 @@ def can_allocate(self, device=Device.GPU) # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): + if (self.num_total_gpu_blocks - num_required_blocks + < self.watermark_blocks): return AllocStatus.NEVER if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: return AllocStatus.OK diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b1630b34947bd..2bb961481e5fe 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -988,8 +988,8 @@ def _schedule_prefills( waiting_queue.popleft() continue - if (budget.num_batched_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (budget.num_batched_tokens + >= self.scheduler_config.max_num_batched_tokens): # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. @@ -1096,8 +1096,8 @@ def _schedule_default(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1189,8 +1189,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: curr_loras, enable_chunking=True) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1358,8 +1358,8 @@ def schedule( # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. 
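The condition this NOTE refers to (re-wrapped just below) only enables sampling once a chunk reaches the end of the sequence. A toy restatement with made-up numbers:

    # Chunked prefill: sample only when this chunk completes the sequence.
    token_chunk_size, num_computed_tokens, seq_len = 16, 32, 50
    do_sample = not (token_chunk_size + num_computed_tokens < seq_len)
    # False here: 48 < 50, so the prompt is not finished yet.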
- if (token_chunk_size + num_computed_tokens < - seqs[0].data.get_len()): + if (token_chunk_size + num_computed_tokens + < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by @@ -1625,10 +1625,9 @@ def _passed_delay(self, now: float) -> bool: if self.scheduler_config.delay_factor > 0 and self.waiting: earliest_arrival_time = min( [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ( - (now - earliest_arrival_time) > - (self.scheduler_config.delay_factor * self.last_prompt_latency) - or not self.running) + passed_delay = ((now - earliest_arrival_time) + > (self.scheduler_config.delay_factor * + self.last_prompt_latency) or not self.running) else: passed_delay = True return passed_delay diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 4ced991f62f66..268edc0925fe8 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -352,8 +352,8 @@ def acquire_write(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 @@ -410,8 +410,8 @@ def acquire_read(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. 
", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ffdf8b0f48087..7fe9b68d4b9e8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1014,8 +1014,8 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: return if all([ - vllm_config.kv_transfer_config.need_kv_parallel_group, - _KV_TRANSFER is None + vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER + is None ]): _KV_TRANSFER = kv_transfer.KVTransferAgent( rank=get_world_group().rank, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index beedf5d16ab86..723d6e9085806 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -3,7 +3,7 @@ import json from abc import ABC, abstractmethod from collections import defaultdict, deque -from functools import lru_cache, partial +from functools import cache, lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypeVar, Union, cast) @@ -377,7 +377,7 @@ def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @staticmethod - @lru_cache(maxsize=None) + @cache def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2c9c20caf8119..b0179f78bd635 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -522,11 +522,10 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token( - top_lp[1], - top_lp[0], - tokenizer, - return_as_token_id=self.return_tokens_as_token_ids): + self._get_decoded_token(top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 94db8f379e33a..93e357e8b9f21 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -62,8 +62,8 @@ def extract_tool_calls( start_of_json = match.end() # end_index == the start of the next function call # (if exists) - next_function_call_start = (matches[i + 1].start() - if i + 1 < len(matches) else None) + next_function_call_start = (matches[i + 1].start() if i + + 1 < len(matches) else None) raw_function_calls.append( dec.raw_decode( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e6f26d2b74b2f..cdd439d0385b6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -220,8 +220,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: 
self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ].copy_(embeddings_tensor, non_blocking=True) + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part @@ -1024,8 +1026,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ] = embeddings_tensor def _get_logits( self, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b77b6b3d72ff4..2e04cb902d009 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -75,8 +75,9 @@ def __init__( # Scaling factor for long context lora model. None if it is not # fine tuned for the long context. self.scaling_factor = scaling_factor - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" + assert ( + lora_model_id + > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank self.loras: Dict[str, LoRALayerWeights] = loras diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 8af44b703810b..48fa5cd63741f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -136,9 +136,8 @@ def _sgmv_expand_kernel( c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + offset_cn[None, :] * output_d1_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < - (cur_slice_start + curr_N)) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (cur_slice_start + curr_N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 3d2ebe8286f56..9bb35e8ffd323 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -114,8 +114,8 @@ def _sgmv_shrink_kernel( slice_id * output_d0_stride) c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] + < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index b04612a9b00d9..915bdc4778929 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -73,12 +73,12 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # w_q - torch.Tensor, # w_s - Optional[torch.Tensor], # w_zp, - Optional[torch.Tensor] # w_gidx - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + Optional[torch.Tensor], # w_zp, + Optional[torch.Tensor] # w_gidx + ]: return ( 
getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 75cf91f191136..c4a83b4faafe6 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -48,13 +48,13 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # weight - torch.Tensor, # weight_scale - Optional[torch.Tensor], # input_scale, - Optional[torch.Tensor], # input_zp - Optional[torch.Tensor], # azp_adj - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # weight + torch.Tensor, # weight_scale + Optional[torch.Tensor], # input_scale, + Optional[torch.Tensor], # input_zp + Optional[torch.Tensor], # azp_adj + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b6882cc7c837c..43b1997019107 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -72,9 +72,10 @@ def block_quant_to_tensor_quant( x_dq_block = x_q_block.to(torch.float32) x_dq_block_tiles = [[ - x_dq_block[j * block_n:min((j + 1) * block_n, n), - i * block_k:min((i + 1) * block_k, k), ] - for i in range(k_tiles) + x_dq_block[ + j * block_n:min((j + 1) * block_n, n), + i * block_k:min((i + 1) * block_k, k), + ] for i in range(k_tiles) ] for j in range(n_tiles)] for i in range(k_tiles): diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 7cdce67cf1677..9977804188a50 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -73,8 +73,8 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 - unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( - torch.float8_e4m3fn).min) + unfused_module_in_checkpoint = (weight_scale[-1] + > torch.finfo(torch.float8_e4m3fn).min) # If unfused checkpoint, need requanize with the single scale. 
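A hedged sketch of the sentinel check above (the finfo.min-as-initialization detail is inferred from the surrounding comment, and the helper name here is made up):

    import torch

    FP8_MIN = torch.finfo(torch.float8_e4m3fn).min

    def needs_requantization(weight_scale: torch.Tensor) -> bool:
        # Shard scales are assumed to start at finfo.min; if the last shard's
        # scale rose above that sentinel, per-shard scales were loaded from
        # the checkpoint and the fused layer is requantized to one max scale.
        return bool(weight_scale[-1] > FP8_MIN)

    needs_requantization(torch.tensor([0.02, 0.03, FP8_MIN]))  # False (fused)
    needs_requantization(torch.tensor([0.02, 0.03, 0.04]))     # True  (unfused)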
if unfused_module_in_checkpoint: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c2d12c466ba45..8dc26309d754e 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -716,9 +716,10 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, - List[int]] = {t: [] - for t in SamplingType} + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + t: [] + for t in SamplingType + } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 3eb5c39ccf580..f230efacacdbb 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -115,17 +115,17 @@ def num_elements_padded(self) -> int: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert (self.padded_org_vocab_start_index + <= self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index + <= self.padded_added_vocab_end_index) assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert (self.added_vocab_start_index + <= self.padded_added_vocab_start_index) assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -141,8 +141,8 @@ def get_masked_input_and_mask( added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( input_ < added_vocab_end_index) added_offset = added_vocab_start_index - ( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 527b4307f3670..712266ee42639 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1121,8 +1121,9 @@ def _load_weights(self, model_config: ModelConfig, # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight shard_pos = quant_param_name.find(shard_name) - can_correct_rename = (shard_pos > 0) and ( - quant_param_name[shard_pos - 1] == ".") + can_correct_rename = (shard_pos + > 0) and (quant_param_name[shard_pos - 1] + == ".") # If the quant_param_name is packed, it won't occur in the # param_dict before renaming. 
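The substring guard above exists because a bare str.find() can hit a longer shard name. A small standalone illustration of the failure mode it prevents:

    quant_param_name = "vpm.encoder.layers.0.self_attn.qkv_proj.weight"
    shard_name = "v_proj"
    shard_pos = quant_param_name.find(shard_name)   # matches inside "qkv_proj"
    can_correct_rename = shard_pos > 0 and quant_param_name[shard_pos - 1] == "."
    # False: the character before the match is "k", so this is not a real
    # "v_proj" shard and no rename should be attempted.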
new_quant_param_name = quant_param_name.replace( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index e359aef9dcb7f..9266ca75ddaac 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -298,8 +298,8 @@ def _resize_lora_embeddings(self): to allow for adapter added tokens.""" for child in self.model.modules(): if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] < - child.num_embeddings_per_partition): + and child.weight.shape[0] + < child.num_embeddings_per_partition): new_weight = torch.empty(child.num_embeddings_per_partition, child.embedding_dim, dtype=child.weight.dtype, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6de0c866bc2f0..b23aba829c549 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cache from typing import Iterable, List, Optional, Set, Tuple, Union import torch @@ -48,7 +48,7 @@ logger = init_logger(__name__) -@lru_cache(maxsize=None) +@cache def _get_gemma_act_fn( hidden_act: Optional[str], hidden_activation: Optional[str], diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b518a0a6cbdee..cdf9414d5949c 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -429,10 +429,10 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w1.weight" % e) + f".block_sparse_moe.experts.{e}.w1.weight") w3_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w3.weight" % e) + f".block_sparse_moe.experts.{e}.w3.weight") w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights assert w3_name not in new_weights @@ -442,7 +442,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', - ".block_sparse_moe.experts.%d.w2.weight" % e) + f".block_sparse_moe.experts.{e}.w2.weight") w2_param = p[e] assert w2_name not in new_weights new_weights[w2_name] = w2_param diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 61baa8e588d74..e15ac84a6049b 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1365,8 +1365,8 @@ def forward( # For 1) text-only prefill and decode, 2) image-present decode. if image_inputs is None: full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( - input_ids.device) + attn_metadata.encoder_seq_lens_tensor + != 0).reshape(-1, 1).to(input_ids.device) skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 # For image-present prefill. 
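A toy trace of the mask computed just above, for the text-only and image-present-decode path:

    import torch

    encoder_seq_lens_tensor = torch.tensor([0, 7, 0, 3])  # made-up per-request lengths
    # Requests with no encoder (image) tokens get their cross-attention rows
    # masked out entirely.
    full_text_row_masked_out_mask = (encoder_seq_lens_tensor != 0).reshape(-1, 1)
    # rows: False, True, False, True
    skip_cross_attention = int(encoder_seq_lens_tensor.max()) == 0   # False here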
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index d49da5f29aa14..f1d796ca26a16 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -81,8 +81,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: if self.tie_weights: assert ( - self.n_predict > - 1), "You cannot tie weights between stages when only 1 exists" + self.n_predict > 1 + ), "You cannot tie weights between stages when only 1 exists" embedding = VocabParallelEmbedding( config.vocab_size, self.inner_dim, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 881c09ea9db99..6367b770a0aff 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): # compute mask for sparsity mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) @@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d2719ca2d00d..8d71b19060bf4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,8 @@ def is_hybrid_model( ModelRegistry = _ModelRegistry({ - model_arch: _LazyRegisteredModel( + model_arch: + _LazyRegisteredModel( module_name=f"vllm.model_executor.models.{mod_relname}", class_name=cls_name, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d577e545a473b..605a0ecf4e0a9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -333,10 +333,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, - info=UltravoxProcessingInfo, - dummy_inputs=UltravoxDummyInputsBuilder - ) +@MULTIMODAL_REGISTRY.register_processor( + UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 43b3c973c97b8..01a232fdc76de 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -599,9 +599,8 @@ def make_empty_intermediate_tensors( device: torch.device, ) -> IntermediateTensors: return IntermediateTensors({ - key: torch.zeros((batch_size, hidden_size), - dtype=dtype, - device=device) + key: + torch.zeros((batch_size, hidden_size), dtype=dtype, device=device) for key in keys }) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84ed4093..61e8881b64f5d 100644 --- 
a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -166,7 +166,8 @@ def prepare( pin_memory=pin_memory, ) categorized_sample_indices = { - t: async_tensor_h2d( + t: + async_tensor_h2d( seq_ids, dtype=torch.int, target_device=device, @@ -198,8 +199,12 @@ def _prepare_seq_groups( device: str, generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, - List[int]], int, ]: +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], + int, +]: """Prepare sequence groups and indices for sampling. Args: diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index ead3dab05a6b1..23a7126fb05cf 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -38,8 +38,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.world_size > 1: parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config is - None), "LoRA is not supported for Neuron backend." + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9d711b0debcd8..20063a5b4b085 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -121,8 +121,8 @@ def _raw_min(self) -> Union[int, float]: min_raw = max_raw | sign_bit_double return struct.unpack('!d', struct.pack('!Q', min_raw))[0] else: - assert (not self.is_signed() or - self.size_bits <= 64), "Cannot represent min as a int64_t" + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" if self.is_signed(): return -(1 << (self.size_bits - 1)) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8e9802c7d333c..af1c4dfcebbc0 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -510,8 +510,8 @@ def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. 
- return (execute_model_req.running_queue_size >= - self.disable_by_batch_size) + return (execute_model_req.running_queue_size + >= self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 5a7999a258b2d..6bf7587cdda19 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -104,11 +104,11 @@ def get_spec_proposals( sampler_transposed=transposed, ) - proposals = SpeculativeProposals( - proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output is None) + proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + no_proposals=maybe_sampler_output + is None) return proposals def _split_by_proposal_len( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da8706658d09a..c88820ab27b69 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -40,13 +40,15 @@ def get_sampled_token_logprobs( """ num_steps, batch_size, vocab_size = logprob_tensor.shape - selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, ] + selected_logprobs = logprob_tensor[ + torch.arange(num_steps).unsqueeze(1), + torch.arange(batch_size), + sampled_token_ids, + ] expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor > - expanded_selected_logprobs).sum(-1).add_(1) + sampled_token_ids_ranks = (logprob_tensor + > expanded_selected_logprobs).sum(-1).add_(1) return sampled_token_ids_ranks, selected_logprobs diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 93fec667d1cf3..1edf36329d83b 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -182,8 +182,8 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len( + self.rope_scaling) != 2: raise ValueError( "`rope_scaling` must be a dictionary with two fields, " f"`type` and `factor`, got {self.rope_scaling}") diff --git a/vllm/utils.py b/vllm/utils.py index 17bffd2846b46..15481fb06e08e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,7 +29,7 @@ from collections import OrderedDict, UserDict, defaultdict from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field -from functools import lru_cache, partial, wraps +from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, @@ -352,7 +352,7 @@ def reset(self): self._index = 0 -@lru_cache(maxsize=None) +@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops @@ -697,7 +697,7 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache(maxsize=None) +@cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None: 
init_hf_modules() -@lru_cache(maxsize=None) +@cache def find_library(lib_name: str) -> str: """ Find the library file in the system. @@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module -@lru_cache(maxsize=None) +@cache def get_vllm_optional_dependencies(): metadata = importlib.metadata.metadata("vllm") requirements = metadata.get_all("Requires-Dist", []) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index de7fb1a698df6..7a88cc9433b32 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -247,8 +247,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens + < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 500bc356fc179..902800e0573bf 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -311,8 +311,8 @@ def output_token_latency_s_lst(self) -> List[float]: return [] latency_s_lst = [] for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] >= - self.output_token_ts_s_lst[i - 1]) + assert (self.output_token_ts_s_lst[i] + >= self.output_token_ts_s_lst[i - 1]) latency_s = (self.output_token_ts_s_lst[i] - self.output_token_ts_s_lst[i - 1]) latency_s_lst.append(latency_s) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d7e30079dfbb..a00c00c307335 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -205,7 +205,7 @@ def __init__( def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. - # Keep the states of the pre-empted requests. + # Keep the states of the preempted requests. 
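On the repeated @lru_cache(maxsize=None) -> @cache swaps in this series: functools.cache (Python 3.9+) is documented as exactly an unbounded lru_cache, so the change is cosmetic. A quick self-contained check:

    from functools import cache, lru_cache

    @cache                      # same behavior as @lru_cache(maxsize=None)
    def fib(n: int) -> int:
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    @lru_cache(maxsize=None)
    def fib_lru(n: int) -> int:
        return n if n < 2 else fib_lru(n - 1) + fib_lru(n - 2)

    assert fib(30) == fib_lru(30) == 832040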
for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 3c570212625c4..aaf9cb40bf2aa 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -173,13 +173,13 @@ def execute_model( cpu_fallback_ctx as cpu_fallback_local_metric: output = LocalOrDistributedWorkerBase.execute_model( self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: + if (log_graph_compilation and gc_local_metric.stats()[0][1] + > 0) or log_graph_compilation_all: msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " f"{gc_local_metric.stats()}, {input_stats}") logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] + > 0) or log_cpu_fallbacks_all: msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " f"{cpu_fallback_local_metric.stats()}, {input_stats}") logger.warning(msg) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a3f648f4cc645..8749518284288 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -316,8 +316,8 @@ def warmup_model( logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) num_tokens = batch_size * seq_len - if (num_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (num_tokens + >= self.scheduler_config.max_num_batched_tokens): break seq_len = seq_len * 2 end = time.time() From ddee88d0ff2757bdef98a83a9c78af1ea4559758 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Mon, 27 Jan 2025 17:31:16 -0800 Subject: [PATCH 10/69] [Neuron][Kernel] NKI-based flash-attention kernel with paged KV cache (#11277) Signed-off-by: Liangfu Chen Co-authored-by: Jiangfei Duan --- .buildkite/run-neuron-test.sh | 2 +- tests/neuron/test_prefix_prefill.py | 456 ++++++++++++++++++ vllm/attention/ops/nki_flash_attn.py | 669 +++++++++++++++++++++++++++ 3 files changed, 1126 insertions(+), 1 deletion(-) create mode 100644 tests/neuron/test_prefix_prefill.py create mode 100644 vllm/attention/ops/nki_flash_attn.py diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 0590dad4f311f..1ad77cf50f612 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py new file mode 100644 index 0000000000000..77b707a737118 --- /dev/null +++ b/tests/neuron/test_prefix_prefill.py @@ -0,0 +1,456 @@ +import random +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + + +class BlockDiagonalCausalFromBottomRightMask: + + @staticmethod + def _from_seqlens(query_lens, seq_lens, block_size=None): + from torch import logical_and, logical_or + + contexted = block_size is None + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + n_queries = sum(query_lens) + num_seqs = len(query_lens) + if contexted: + key_lens_blockaligned = seq_lens + 
else: + n_blocks_per_seq = (context_lens + block_size - 1) // block_size + offset_per_seq = n_blocks_per_seq * block_size + key_lens_blockaligned = offset_per_seq[:num_seqs].tolist() + n_keys = sum(key_lens_blockaligned) + + a = (torch.arange(n_queries).reshape(n_queries, + 1).expand(n_queries, n_keys)) + b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys) + q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0) + k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0) + + prior_mask = torch.zeros(n_queries, n_keys) + new_masks: list[torch.Tensor] = [] + for seq_id in range(num_seqs): + ri = q_cumsum[seq_id] + ci = k_cumsum[seq_id] + nr = query_lens[seq_id] + + if contexted: + nc = seq_lens[seq_id] + a_offset = ci + nc - ri - nr + new_mask = (a + a_offset) >= b + else: + nc = context_lens[seq_id] + a_offset = ci + nc - 1 + new_mask = a_offset >= b + + left_mask = b >= ci + top_mask = a >= ri + bottom_mask = a < (ri + nr) + + new_mask = logical_and( + logical_and(logical_and(new_mask, left_mask), top_mask), + bottom_mask, + ) + prior_mask = logical_or(prior_mask, new_mask) + new_masks = new_masks + [new_mask] + return prior_mask + + @staticmethod + def from_seqlens(query_lens, seq_lens, block_size=None): + contexted = block_size is None + if contexted: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens) + active_mask = None + else: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens, block_size) + active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, query_lens) + return prior_mask, active_mask + + +def ref_softmax(x: torch.Tensor, + dim: int, + mixed_precision=False, + return_max_reduce=False): + max_value = torch.amax(x, dim=dim, keepdims=True) + exp = torch.exp(x - max_value) + if mixed_precision: + sum_value = torch.sum(exp.astype(torch.float32), + dim=dim, + keepdims=True).astype(x.dtype) + else: + sum_value = torch.sum(exp, dim=dim, keepdims=True) + if return_max_reduce: + return exp / sum_value, max_value, torch.reciprocal(sum_value) + return exp / sum_value + + +def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, + return_max_reduce: Optional[bool] = False, +) -> torch.Tensor: + scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float() + if attn_mask is not None: + masked_score = scaled_qk + attn_mask.float() + if return_max_reduce: + norm_score, cached_max, cached_sum_reciprocal = ref_softmax( + masked_score, dim=-1, return_max_reduce=True) + else: + norm_score = ref_softmax(masked_score, dim=-1) + out = torch.einsum("hqk,khd->qhd", norm_score, value) + if return_max_reduce: + return ( + out, + cached_max, + cached_sum_reciprocal, + norm_score, + masked_score, + scaled_qk, + ) + else: + return out + + +def ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=False, +): + scale = float(1.0 / (head_size**0.5)) + if num_queries_per_kv > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) + value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) + + attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens) + + # convert binary mask to -inf values + attn_mask = torch.logical_not(attn_mask) + attn_mask = attn_mask.float() * -30000 + + output, cached_max, 
cached_sum_reciprocal, lse, masked_score, scaled_qk = ( + ref_masked_attention( + query, + key, + value, + scale, + attn_mask, + return_max_reduce=return_max_reduce, + )) + + output = output.unsqueeze(1) + if return_max_reduce: + return ( + output, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) + else: + return output + + +@pytest.mark.parametrize( + "num_heads,num_queries_per_kv,head_size,mixed_precision", + [ + (4, 2, 8, False), + (4, 2, 8, True), + (32, 8, 64, True), + ], +) +@torch.inference_mode() +def test_contexted_kv_attention( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + mixed_precision: bool, +) -> None: + import os + + import torch_xla.core.xla_model as xm + + from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc + + device = xm.xla_device() + + os.environ["NEURON_CC_FLAGS"] = ( + " --model-type=transformer -O1 " + " --internal-hlo2tensorizer-options='--verify-hlo' ") + + random.seed(0) + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + + min_ctx_len = 2 + max_ctx_len = 64 + min_query_len = 2 + max_query_len = 64 + prefill_batch_size = 2 + decode_batch_size = 6 + batch_size = prefill_batch_size + decode_batch_size + block_size = 32 + max_model_len = (max_query_len + max_ctx_len) * 4 + + max_block_per_request = max_model_len // block_size + dtype = torch.float32 + cache_size = (batch_size * max_block_per_request) + 2 + ctx_lens = [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(prefill_batch_size) + ] + [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(decode_batch_size) + ] + query_lens = [ + random.randint(min_query_len, max_query_len) + for _ in range(prefill_batch_size) + ] + [1 for _ in range(decode_batch_size)] + seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] + num_kv_heads = num_heads // num_queries_per_kv + + num_tokens = sum(query_lens) + query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + query.uniform_(-1, 1) + torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + + kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) + kv.uniform_(-1, 1) + key, value = kv.unbind(dim=1) + + k_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + v_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + values = torch.arange(0, cache_size, dtype=torch.long) + values = values[torch.randperm(cache_size)] + block_table = values[:batch_size * max_block_per_request].view( + batch_size, max_block_per_request) + torch.tensor(seq_lens, dtype=torch.long) + b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], + dtype=torch.long), + dim=0) + # copy kv to cache + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], + dtype=torch.long), + dim=0) + for i in range(batch_size): + for j in range(query_lens[i]): + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + + j]) + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + + b_ctx_len[i] + j]) + cur_ctx = 0 + block_id = 0 + while cur_ctx < b_ctx_len[i]: + start_loc = b_seq_start_loc[i] + cur_ctx + if cur_ctx + block_size > b_ctx_len[i]: + end_loc = b_seq_start_loc[i] + b_ctx_len[i] + else: + end_loc = start_loc + block_size + start_slot = block_table[i, block_id] * block_size + 
end_slot = start_slot + end_loc - start_loc + k_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc]) + v_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc]) + cur_ctx += block_size + block_id += 1 + + ( + output_ref, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=True, + ) + + # build neuron program + return_debug_tensors = False + B_P_SIZE = 128 + LARGE_TILE_SZ = 2048 + max_num_queries = ( + (sum(query_lens) + block_size - 1) // block_size) * block_size + + def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, + num_blocks): + context_lens = seq_lens - query_lens + blocks_per_seq = (context_lens + block_size - 1) // block_size + num_seqs = len(seq_lens) + active_blocks: list[int] = [] + for seq_id in range(num_seqs): + active_blocks = ( + active_blocks + + block_tables[seq_id, :blocks_per_seq[seq_id]].tolist()) + return F.pad( + torch.tensor(active_blocks), + (0, num_blocks - len(active_blocks)), + "constant", + 0, + ) + + def shift_bit_length(x): + return 1 << (x - 1).bit_length() + + # calculate input shapes + max_num_queries_shifted = shift_bit_length(max_num_queries) + max_num_queries_factor = B_P_SIZE // max_num_queries_shifted + max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor + assert (max_num_queries_padded == B_P_SIZE + ), "invalid {max_num_queries_padded=}" + head_size_padded = B_P_SIZE + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks_shifted = shift_bit_length( + ((context_lens + block_size - 1) // block_size).sum().item()) + num_active_blocks_factor = (LARGE_TILE_SZ // block_size // + num_active_blocks_shifted) + num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor + assert (num_active_blocks * + block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}" + context_kv_len = num_active_blocks * block_size + assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}" + + # pad QKV tensors + pad_dims = ( + 0, + head_size_padded - query.shape[2], + 0, + 0, + 0, + max_num_queries_padded - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k, pad_dims, "constant", 0) + v = F.pad(v, pad_dims, "constant", 0) + k_cache = F.pad(k_cache, (0, head_size_padded - head_size), "constant", 0) + v_cache = F.pad(v_cache, (0, head_size_padded - head_size), "constant", 0) + + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + + # transform block table + active_block_table = get_active_block_tables( + block_table, + torch.tensor(query_lens), + torch.tensor(seq_lens), + block_size, + num_active_blocks, + ) + + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + attn_mask = torch.concat( + [ + F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + B_P_SIZE - prior_mask.shape[0], + ), + "constant", + 0, + ).bool(), + F.pad( + active_mask, + ( + 0, + B_P_SIZE - active_mask.shape[1], + 0, + B_P_SIZE - 
active_mask.shape[0], + ), + "constant", + 0, + ).bool(), + ], + dim=1, + ) + + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(torch.int32).to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + ) + + if return_debug_tensors: + output_nki, *debug_tensors = flash_attn_varlen_nkifunc( + *input_args, **input_kwargs) + else: + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + debug_tensors = [] + + output_nki = torch.tensor(output_nki).cpu() + debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors] + + num_actual_tokens = sum(query_lens) + print(f"{num_actual_tokens=}") + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.permute( + 0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py new file mode 100644 index 0000000000000..b9765b0f0283d --- /dev/null +++ b/vllm/attention/ops/nki_flash_attn.py @@ -0,0 +1,669 @@ +from dataclasses import dataclass + +import neuronxcc.nki.isa as nisa +import neuronxcc.nki.language as nl +import numpy as np +from neuronxcc import nki +from neuronxcc.nki.language import par_dim + + +@dataclass(frozen=True) +class FlashConfig: + """ + Config class for flash attention with default values + """ + + seq_tile_size: int = 2048 + should_transpose_v: bool = False + + __annotations__ = { + "seq_tile_size": int, + "should_transpose_v": bool, + } + + +@nki.jit +def transpose_p_local(p_local_transposed, + p_local, + LARGE_TILE_SZ, + forward_mask, + B_F_SIZE=512): + for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.sbuf, + dtype=p_local.dtype) + else: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.psum, + dtype=np.float32) + + for j in nl.affine_range(B_F_SIZE // 128): + j_128_slice = nl.ds(j * 128, 128) + i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) + + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + else: + p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + + p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( + p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask) + + +@nki.jit +def _flash_attention_core( + q_local_tile, + k, + v, + q_h_per_k_h, + seqlen_q, + nheads, + o_buffer, + l_buffer, + m_buffer, + batch_id, + head_id, + gqa_head_idx, + q_tile_idx, + local_k_large_tile_idx, + kernel_dtype, + acc_type, + flash_config: FlashConfig, + use_causal_mask=False, + continuous_batching_mask=None, + initialize=False, + B_P_SIZE=128, + B_F_SIZE=512, + B_D_SIZE=128, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=None, +): + """ + The flash attention core function to calculate self attention between a tile + of q and a block 
of K and V. + The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF + already. The block size of K and V + is defined in the seq_tile_size of the flash_config. The results are stored + in the following three buffers + o_buffer: (B_P_SIZE, d) + l_buffer: (B_P_SIZE, 1) + m_buffer: (B_P_SIZE, 1) + """ + LARGE_TILE_SZ = flash_config.seq_tile_size + num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE + seqlen_k = k.shape[-1] + seqlen_q // B_P_SIZE + seqlen_k // B_F_SIZE + + # TODO : support logit_bias with continuous_batching_mask + assert not use_causal_mask, "causal mask is not supported." + assert (continuous_batching_mask + is not None), "continuous_batching_mask input is required." + if continuous_batching_mask is not None: + assert (logit_bias_tile is + None), "continuous_batching_mask does not support logit_bias!" + + # mask are used to only apply computation to the lower half of the matrix, + # which reduce the arthimetic intensity by half + forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * + LARGE_TILE_SZ if use_causal_mask else None) + + qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + buffer=nl.sbuf, + dtype=acc_type) + max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), + dtype=acc_type) + for k_i in nl.affine_range(num_k_tile_per_large_tile): + k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) + + qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE), + dtype=np.float32, + buffer=nl.psum) # (128, 512) + qk_psum[:, :] = nl.matmul(q_local_tile, + k[:, k_i_b_f_slice], + transpose_x=True, + mask=None) # (p(128), 512) + + qk_res_buf[:, k_i_b_f_slice] = nl.where( + continuous_batching_mask[:, k_i_b_f_slice], + qk_psum[:, nl.ds(0, B_F_SIZE)], + -9984.0, + dtype=acc_type, + ) + + # Calculate max of the current tile + max_local[:, k_i] = nisa.tensor_reduce( + np.max, + qk_res_buf[:, k_i_b_f_slice], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + if qk_res_buffer is not None: + qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :]) + + max_ = nisa.tensor_reduce( + np.max, + max_local[:, :], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), + dtype=o_buffer.dtype) + + if initialize: + m_buffer[:, 0] = nl.copy(max_) + m_current = max_ + else: + m_previous = nl.copy(m_buffer[:, 0]) + m_buffer[:, 0] = nl.maximum(m_previous, max_, + mask=forward_mask) # (128,1) + + m_current = m_buffer[:, 0] + # Compute scaling factor + alpha = nisa.activation( + np.exp, + m_previous, + bias=-1 * m_current, + scale=1.0, + mask=forward_mask, + ) + o_previous_scaled[...] = nl.multiply(o_buffer[:, :], + alpha, + mask=forward_mask) + + p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) + + p_partial_sum = nl.ndarray( + (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type) + + for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): + k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) + + # compute exp(qk - max) + # Compute partial row - tile sum of exp(qk - max)) + # FIXME : Use activation accumulate to accumulate over k_r_i loop ? 
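+        # activation_reduce fuses the element-wise exp(qk - max) with a
+        # row-wise add-reduction: p_local receives the softmax numerators and
+        # p_partial_sum[:, k_r_i] their per-row partial sums for this slice.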
+ p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce( + np.exp, + qk_res_buf[:, k_r_i_reduce_slice], + bias=-1 * m_current, + scale=1.0, + reduce_op=nl.add, + reduce_res=p_partial_sum[:, k_r_i], + dtype=kernel_dtype, + mask=forward_mask, + ) + + ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask) + + p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + transpose_p_local( + p_local_transposed=p_local_transposed, + p_local=p_local, + LARGE_TILE_SZ=LARGE_TILE_SZ, + forward_mask=forward_mask, + B_F_SIZE=B_F_SIZE, + ) + + pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE), + dtype=np.float32, + buffer=nl.psum) + for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): + pv_psum[:, :] += nl.matmul( + p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], + v[k_i, :, :], + transpose_x=True, + mask=forward_mask, + ) # (128, 128) (p(Br), d) + + if initialize: + o_buffer[:, :] = nl.copy(pv_psum[:, :]) + l_buffer[:, 0] = nl.add(nl.log(ps), max_) + else: + o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask) + + l_prev = l_buffer[:, 0] + l_exp = nl.add( + nl.exp( + nl.subtract(l_prev, m_current, mask=forward_mask), + mask=forward_mask, + ), + ps, + mask=forward_mask, + ) + l_buffer[:, 0] = nl.add(m_current, + nl.log(l_exp, mask=forward_mask), + mask=forward_mask) + + +@nki.jit +def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config): + LARGE_TILE_SZ = config.seq_tile_size + B_P_SIZE = 128 + + if not config.should_transpose_v: + cur_v_tile[v_i, :, :] = nl.load( + v_hbm_tile[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :], + dtype=cur_v_tile.dtype, + ) + return + + if nisa.get_nc_version() == nisa.nc_version.gen3: + cur_v_tile_transposed = nisa.dma_transpose( + v_hbm_tile[:, + nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)]) + cur_v_tile[v_i, :, :] = nisa.tensor_copy(cur_v_tile_transposed, + dtype=cur_v_tile.dtype) + return + + cur_v_tile[v_i, :, :] = nl.load_transpose2d( + v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)], + dtype=cur_v_tile.dtype, + ) + + +@nki.jit +def flash_paged_attention( + query, + key, + value, + key_cache, + value_cache, + block_tables, + mask, + softmax_scale=None, + mixed_precision=True, + config=None, + return_debug_tensors=False, +): + """ + Flash PagedAttention Forward Kernel. + - PagedAttention Paper: https://arxiv.org/abs/2309.06180 + - Chunked Prefill Paper: https://arxiv.org/abs/2403.02310 + + IO tensor layouts: + - query: shape (1, n_heads, d, seq_q) + - key: shape (1, n_kv_heads, d, seq_k) + - value: shape (1, n_kv_heads, seq_v, d) + - key_cache: (num_blocks, block_size, n_kv_heads, d) + - value_cache: (num_blocks, block_size, n_kv_heads, d) + - block_tables: (num_active_blocks, ) + - mask: (seq_q, num_active_blocks * block_size) + - o: shape (1, n_heads, seq_q, d) + - l_m: shape (1, n_heads, seq_q, 2) + + - This kernel requires seq_k == seq_v + - We use continuous batching by default, so the batch dimension is + always 1, and different requests are concatenated along sequence + dimension. + - We use paged cache blocks (key_cache, value_cache) to store KV cache. + + IO tensor dtypes: + - This kernel assumes all IO tensors have the same dtype except for + block_tables (int32) and mask (int32) + - If mixed_percision is True, then all Tensor Engine operation will be + performed in bfloat16 and accumulation will be performed in float32. + Otherwise the intermediates will be in the same type as the inputs. 
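+      - The attention output o is written back in the dtype of query; the
+        optional debug statistics (l, m) stay in the accumulation dtype.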
+ + Compile-time Constants: + - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` + - mixed_precision: flag to set non-matmul ops in fp32 precision, default + is set to `true`, if false, we use same precision as input types + - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig` + with Performance config parameters for flash attention with default + values + seq_tile_size: `default=2048`, size of the kv tile size for attention + computation reduction + + GQA support Notes: + the spmd kernel for launching kernel should be on kv_heads instead of + nheads + + Example usage: + MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d] + usage: `flash_fwd[b, h](q, k, v, ...)` + GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d] + usage: `flash_fwd[b, kv_h](q, k, v, ...)` + """ + config = config or FlashConfig() + B_F_SIZE = 512 + B_P_SIZE = 128 + b, h, d, seqlen_q = query.shape + B_D_SIZE = d + LARGE_TILE_SZ = config.seq_tile_size + n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine + num_blocks, block_size, k_h, _ = key_cache.shape + q_h_per_k_h = h // k_h + assert tuple(key_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert tuple(value_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert b == 1, f"invalid batch size {b=}" + assert d <= 128, f" we do not support head_dim > 128, got head dim {d}" + kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype + acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype + + o = nl.ndarray((b, h, seqlen_q, d), + dtype=query.dtype, + buffer=nl.shared_hbm) + hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = ( + None, + None, + None, + None, + ) + if return_debug_tensors: + hbm_l_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_m_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + qk_res_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + assert ( + nl.program_ndim() == 2 + ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!" 
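+    # The kernel is launched on a (batch, kv_head) SPMD grid; each program
+    # instance handles one KV head and all of the query heads mapped to it
+    # (GQA).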
+ batch_id = nl.program_id(axis=0) + head_id = nl.program_id(axis=1) + + softmax_scale = softmax_scale or (1.0 / (d**0.5)) + + (num_active_blocks, ) = block_tables.shape + context_kv_len = num_active_blocks * block_size + assert (config.seq_tile_size >= 512 + ), f" seq tile_size {config.seq_tile_size} cannot be less than 512" + assert (context_kv_len % LARGE_TILE_SZ == 0 + ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}" + assert ( + LARGE_TILE_SZ % B_P_SIZE == 0 + ), f"Need LARGE_TILE_SZ ({LARGE_TILE_SZ}) to be divisible by {B_P_SIZE=}" + assert (B_P_SIZE % block_size == 0 + ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}" + num_large_k_tile = context_kv_len // LARGE_TILE_SZ + num_blocks_per_large_tile = LARGE_TILE_SZ // block_size + assert (num_blocks_per_large_tile <= B_P_SIZE + ), f"The number of blocks in each large tile " \ + f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}" + + block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile), + 0, + dtype=np.int32, + buffer=nl.sbuf) + for j in nl.affine_range(num_large_k_tile): + i_p = nl.arange(num_blocks_per_large_tile)[:, None] + block_tables_sbuf[i_p, j] = nl.load( + block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32) + + # Global Flash Attention accumulators + o_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + l_buffer = nl.zeros( + (par_dim(B_P_SIZE), n_tile_q, q_h_per_k_h), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + m_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + for j in nl.sequential_range(0, num_large_k_tile): + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + for k_i in nl.affine_range(num_blocks_per_large_tile): + loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :, + head_id, :]) + cur_k_tile[:, nl.ds(k_i * + block_size, block_size)] = nl.transpose(loaded) + + load_tile_size = B_P_SIZE + num_blocks_per_partition = load_tile_size // block_size + for partition_idx in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + for block_in_partition in nl.affine_range( + num_blocks_per_partition): + v_i = (partition_idx * num_blocks_per_partition + + block_in_partition) + loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, + head_id, :]) + cur_v_tile[partition_idx, + nl.ds(block_in_partition * + block_size, block_size), :, ] = loaded_v + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=mask.dtype) + for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load( + mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + 
m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=j, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=j == 0, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + ) + + # compute attention between input query, key and value + if key is not None and value is not None: + B_F_SIZE = seqlen_q + LARGE_TILE_SZ = seqlen_q + active_config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=config.should_transpose_v, + ) + + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + cur_k_tile[:, :] = nl.load(key[batch_id, head_id, :, :]) + + load_tile_size = B_P_SIZE + v_hbm_tile = value[batch_id, head_id] + for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + load_v_tile( + v_hbm_tile=v_hbm_tile, + cur_v_tile=cur_v_tile, + j=0, + v_i=v_i, + config=active_config, + ) + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype) + cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=0, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=active_config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=False, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=qk_res_buffer[i, i_q_h] + if qk_res_buffer is not None else None, + ) + + # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + out = nl.multiply( + o_buffer[i, i_q_h, :, :], + nl.exp(m_buffer[i, i_q_h, :, :] - l_buffer[:, i, i_q_h]), + dtype=kernel_dtype, + ) + + nl.store( + o[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), :, ], + out, + ) + # maximum and summation statistics + if return_debug_tensors: + nl.store( + hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), ], + m_buffer[i, i_q_h, :, :], + ) + nl.store( + hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), ], + l_buffer[:, i, i_q_h], + ) + nl.store( + hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], + qk_res_buffer[batch_id, i_q_h, :, :], + ) + + if return_debug_tensors: + return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res + return o + + +def flash_attn_varlen_nkifunc( + query, + key, + value, + key_cache, + value_cache, + block_table, + attn_mask, + n_kv_head=None, + 
head_size=None, + B_P_SIZE=128, + LARGE_TILE_SZ=2048, + return_debug_tensors=False, + mixed_precision=True, +): + config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=False, + ) + kwargs = dict( + query=query, + key=key, + value=value, + key_cache=key_cache, + value_cache=value_cache, + block_tables=block_table, + mask=attn_mask, + softmax_scale=1.0 / (head_size**0.5), + config=config, + mixed_precision=mixed_precision, + return_debug_tensors=return_debug_tensors, + ) + _, n_kv_head, _, _ = key.shape + + if return_debug_tensors: + o, *debug_tensors = flash_paged_attention[1, n_kv_head](**kwargs) + return o, *debug_tensors + else: + o = flash_paged_attention[1, n_kv_head](**kwargs) + return o From 426a5c362557c6df4604ed084660b8915fbca30c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 27 Jan 2025 20:56:31 -0500 Subject: [PATCH 11/69] Fix bad path in prometheus example (#12481) Signed-off-by: mgoin --- examples/online_serving/prometheus_grafana/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index c49e5306a1cb4..4a85f953b0b4c 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -24,7 +24,7 @@ Submit some sample requests to the server: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 ../../benchmarks/benchmark_serving.py \ +python3 ../../../benchmarks/benchmark_serving.py \ --model mistralai/Mistral-7B-v0.1 \ --tokenizer mistralai/Mistral-7B-v0.1 \ --endpoint /v1/completions \ From 23a7cbc88b5a17499766d1cbc0de283c9f980509 Mon Sep 17 00:00:00 2001 From: Hossein Sarshar Date: Mon, 27 Jan 2025 22:18:07 -0500 Subject: [PATCH 12/69] [CI/Build] Fixed the xla nightly issue report in #12451 (#12453) --- requirements-tpu.txt | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 51a0c65eac5aa..1abde714af7c9 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -10,17 +10,14 @@ wheel jinja2 ray[default] -# Install torch, torch_xla +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -# Note: This torch whl can be slightly different from the official torch nightly whl -# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue -# if some change between the 2 commits introduce some C++ API change. -# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue. 
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.6.0.dev20241216+cpu +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 0f465ab53303fbd3c8ad32163db161cdb0cf8dad Mon Sep 17 00:00:00 2001 From: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:30:13 -0300 Subject: [PATCH 13/69] [FEATURE] Enables offline /score for embedding models (#12021) Signed-off-by: Gabriel Marinho --- .../models/embedding/language/test_scoring.py | 100 +++++++++++ vllm/entrypoints/llm.py | 160 +++++++++++++----- 2 files changed, 216 insertions(+), 44 deletions(-) diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index be6e3842821e2..3db27d942ac8c 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -5,12 +5,18 @@ import math import pytest +import torch +import torch.nn.functional as F MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] +EMBEDDING_MODELS = [ + "sentence-transformers/all-MiniLM-L12-v2", +] + TEXTS_1 = [ "What is the capital of France?", "What is the capital of Germany?", @@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.fixture(scope="module", params=EMBEDDING_MODELS) +def emb_model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = hf_model.encode(text_pair) + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + 
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1860ed3d7db5a..46b595b0da73c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,6 +5,7 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch import torch.nn as nn from tqdm import tqdm from typing_extensions import TypeVar, deprecated @@ -996,6 +997,107 @@ def classify( return [ClassificationRequestOutput.from_base(item) for item in items] + def _embedding_score( + self, + tokenizer: AnyTokenizer, + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + encoded_output = self.encode( + text_1 + text_2, + use_tqdm=use_tqdm, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + encoded_output_1 = encoded_output[0:len(text_1)] + encoded_output_2 = encoded_output[len(text_1):] + + if len(encoded_output_1) == 1: + encoded_output_1 = encoded_output_1 * len(encoded_output_2) + + output_pairs = [(t1, t2) + for t1, t2 in zip(encoded_output_1, encoded_output_2)] + + scores = [] + scorer = torch.nn.CosineSimilarity(0) + + for embed_1, embed_2 in output_pairs: + pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data) + + if (pad_token_id := getattr(tokenizer, "pad_token_id", + None)) is not None: + tokens = embed_1.prompt_token_ids + [ + pad_token_id + ] + embed_2.prompt_token_ids + else: + tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids + + scores.append( + PoolingRequestOutput( + 
request_id=f"{embed_1.request_id}_{embed_2.request_id}", + outputs=pair_score, + prompt_token_ids=tokens, + finished=True)) + + items = self.engine_class.validate_outputs(scores, + PoolingRequestOutput) + return [ScoringRequestOutput.from_base(item) for item in items] + + def _cross_encoding_score( + self, + tokenizer: Union[AnyTokenizer], + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Score API is only enabled for `--task embed or score`") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + + pooling_params = PoolingParams() + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + parsed_prompts = [] + + for q, t in input_pairs: + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=parsed_prompts, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + items = self.engine_class.validate_outputs(outputs, + PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] + def score( self, text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], @@ -1047,25 +1149,20 @@ def score( raise ValueError(" ".join(messages)) - if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support cross encoding") - if self.llm_engine.model_config.task != "score": - raise ValueError("Score API is only enabled for `--task score`") - - tokenizer = self.llm_engine.get_tokenizer() - - if isinstance(tokenizer, MistralTokenizer): + if self.llm_engine.model_config.task not in ("embed", "score"): raise ValueError( - "MistralTokenizer not supported for cross-encoding") + "Score API is only enabled for `--task embed or --task score`") # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs + tokenizer = self.llm_engine.get_tokenizer() + def ensure_str(prompt: SingletonPrompt): if isinstance(prompt, dict): if "multi_modal_data" in prompt: raise ValueError("Multi-modal prompt is not " - "supported for cross encoding") + "supported for scoring") elif "prompt_token_ids" in prompt: prompt = tokenizer.decode( cast(TokensPrompt, prompt)["prompt_token_ids"]) @@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt): if len(text_2) == 0: raise ValueError("At least one text_pair element must be given") - if len(text_1) == 1: - text_1 = text_1 * len(text_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] - pooling_params = PoolingParams() - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - 
parsed_prompts = [] - - for q, t in input_pairs: - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) - - self._validate_and_add_requests( - prompts=parsed_prompts, - params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - - outputs = self._run_engine(use_tqdm=use_tqdm) - items = self.engine_class.validate_outputs(outputs, - PoolingRequestOutput) - - return [ScoringRequestOutput.from_base(item) for item in items] + if self.llm_engine.model_config.is_cross_encoder: + return self._cross_encoding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, + prompt_adapter_request) + else: + return self._embedding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, prompt_adapter_request) def start_profile(self) -> None: self.llm_engine.start_profile() From dd66fd2b01e1195b7ccc8ffcd4b5d49ff1946a56 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 28 Jan 2025 14:11:05 +0800 Subject: [PATCH 14/69] [CI] fix pre-commit error (#12494) Signed-off-by: Mengqing Cao --- vllm/attention/ops/nki_flash_attn.py | 37 +++++++++++++++++--------- vllm/spec_decode/spec_decode_worker.py | 8 +++--- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index b9765b0f0283d..9de4ef7f5a140 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -106,11 +106,12 @@ def _flash_attention_core( assert (continuous_batching_mask is not None), "continuous_batching_mask input is required." if continuous_batching_mask is not None: - assert (logit_bias_tile is - None), "continuous_batching_mask does not support logit_bias!" + assert ( + logit_bias_tile + is None), "continuous_batching_mask does not support logit_bias!" 
# mask are used to only apply computation to the lower half of the matrix, - # which reduce the arthimetic intensity by half + # which reduce the arithmetic intensity by half forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * LARGE_TILE_SZ if use_causal_mask else None) @@ -468,9 +469,11 @@ def flash_paged_attention( block_in_partition) loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, head_id, :]) - cur_v_tile[partition_idx, - nl.ds(block_in_partition * - block_size, block_size), :, ] = loaded_v + cur_v_tile[ + partition_idx, + nl.ds(block_in_partition * block_size, block_size), + :, + ] = loaded_v cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=mask.dtype) @@ -601,20 +604,30 @@ def flash_paged_attention( ) nl.store( - o[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), :, ], + o[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + :, + ], out, ) # maximum and summation statistics if return_debug_tensors: nl.store( - hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), ], + hbm_m_buffer[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + ], m_buffer[i, i_q_h, :, :], ) nl.store( - hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), ], + hbm_l_buffer[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + ], l_buffer[:, i, i_q_h], ) nl.store( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index af1c4dfcebbc0..8d6d05cbaea75 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -870,10 +870,10 @@ def _verify_tokens( accepted_index = accepted_token_ids + 1 # Convert -1 to 0 accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b # Drop non-terminal prefill chunks hidden states. - hidden_states = hidden_states[ - accepted_index != VLLM_INVALID_TOKEN_ID] - accepted_index = accepted_index[ - accepted_index != VLLM_INVALID_TOKEN_ID] + hidden_states = hidden_states[accepted_index != + VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[accepted_index != + VLLM_INVALID_TOKEN_ID] assert len(accepted_index) == hidden_states.shape[0] == len( terminal_metadata) index = accepted_index[:, None, None].expand(-1, 1, From 8cbc4249758d399c0606ef4a1241e01176d0160b Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:22:41 -0800 Subject: [PATCH 15/69] Update README.md with V1 alpha release (#12495) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4ed905bf7aa9d..5fd30f2b1b9d7 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 +- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! 
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). From e29d4358ef054163b80dfb7e53ce3eb0e08d1328 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Tue, 28 Jan 2025 03:27:41 -0500 Subject: [PATCH 16/69] [V1] Include Engine Version in Logs (#12496) Signed-off-by: rshaw@neuralmagic.com --- vllm/engine/llm_engine.py | 2 +- vllm/v1/engine/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ab67ae29723cd..dd677300fc66a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -230,7 +230,7 @@ def __init__( ) logger.info( - "Initializing an LLM engine (v%s) with config: %s, " + "Initializing a V0 LLM engine (v%s) with config: %s, " "use_cached_outputs=%s, ", VLLM_VERSION, vllm_config, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index cf94033a38d96..f50303bda58fd 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -42,7 +42,7 @@ def __init__( ): assert vllm_config.model_config.runner_type != "pooling" - logger.info("Initializing an LLM engine (v%s) with config: %s", + logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) # Setup Model. From 2079e43beecc486a607c9d79ab691e0e4563aa11 Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Tue, 28 Jan 2025 11:56:45 +0100 Subject: [PATCH 17/69] [Core] Make raw_request optional in ServingCompletion (#12503) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sebastian Schönnenbeck --- vllm/entrypoints/openai/serving_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index b0179f78bd635..13c3926368890 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -58,7 +58,7 @@ def __init__( async def create_completion( self, request: CompletionRequest, - raw_request: Request, + raw_request: Optional[Request] = None, ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. 
@@ -137,7 +137,7 @@ async def create_completion( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - trace_headers = (await + trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): From 8f58a5135874770ac8429f4772d7f92fe33094e5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 29 Jan 2025 00:25:05 +0800 Subject: [PATCH 18/69] [VLM] Merged multi-modal processor and V1 support for Qwen-VL (#12504) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.md | 2 +- .../multimodal/processing/test_common.py | 64 +- .../models/multimodal/processing/test_qwen.py | 144 ---- vllm/model_executor/models/qwen.py | 654 ++++++++++-------- 4 files changed, 387 insertions(+), 477 deletions(-) delete mode 100644 tests/models/multimodal/processing/test_qwen.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 8cdc663a0320f..e59150cdd3b83 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -745,7 +745,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. - ✅︎ - ✅︎ - - + - ✅︎ * - `Qwen2AudioForConditionalGeneration` - Qwen2-Audio - T + A+ diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index fe5b733c750a8..b575ec6acbef3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -16,7 +16,6 @@ def _test_processing_correctness( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -25,11 +24,6 @@ def _test_processing_correctness( model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - limit_mm_per_prompt = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() - } - model_config = ModelConfig( model_id, task="auto", @@ -40,18 +34,29 @@ def _test_processing_correctness( dtype="float16", revision=None, hf_overrides=model_info.hf_overrides, - limit_mm_per_prompt=limit_mm_per_prompt, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] ctx = InputProcessingContext( model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), + tokenizer=cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_info.trust_remote_code, + ), ) # Ensure that it can fit all of the data cache = ProcessingCache(capacity=1 << 30) + processing_info = factories.info(ctx) + supported_mm_limits = processing_info.get_supported_mm_limits() + limit_mm_per_prompt = { + modality: 3 if limit is None else limit + for modality, limit in supported_mm_limits.items() + } + + model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) dummy_inputs = baseline_processor.dummy_inputs @@ -82,8 +87,8 @@ def _test_processing_correctness( mm_data = { k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit_mm_per_prompt[k]))] - for k in modalities + for _ in range(rng.randint(limit))] + for k, limit in limit_mm_per_prompt.items() } mm_counts = {k: len(vs) for k, vs in mm_data.items()} @@ -135,21 +140,22 
@@ def _test_processing_correctness( # yapf: disable # True if the model supports multiple data items of the modality per request -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": False}), - ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), - ("adept/fuyu-8b", {"image": False}), - ("llava-hf/llava-1.5-7b-hf", {"image": True}), - ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), - ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), - ("mistral-community/pixtral-12b", {"image": True}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), - ("fixie-ai/ultravox-v0_3", {"audio": True}), +@pytest.mark.parametrize("model_id", [ + "rhymes-ai/Aria", + "Salesforce/blip2-opt-2.7b", + "facebook/chameleon-7b", + "deepseek-ai/deepseek-vl2-tiny", + "adept/fuyu-8b", + "llava-hf/llava-1.5-7b-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + "llava-hf/LLaVA-NeXT-Video-7B-hf", + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "TIGER-Lab/Mantis-8B-siglip-llama3", + "mistral-community/pixtral-12b", + "Qwen/Qwen-VL-Chat", + "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2-Audio-7B-Instruct", + "fixie-ai/ultravox-v0_3", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -157,14 +163,12 @@ def _test_processing_correctness( # yapf: enable def test_processing_correctness( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, ): _test_processing_correctness( model_id, - modalities, hit_rate=hit_rate, num_batches=num_batches, simplify_rate=simplify_rate, @@ -172,16 +176,13 @@ def test_processing_correctness( # yapf: disable -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -]) +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) # yapf: enable def test_processing_correctness_phi3v( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -195,7 +196,6 @@ def test_processing_correctness_phi3v( _test_processing_correctness( model_id, - modalities, hit_rate=hit_rate, num_batches=num_batches, simplify_rate=simplify_rate, diff --git a/tests/models/multimodal/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py deleted file mode 100644 index af0ace711ba3e..0000000000000 --- a/tests/models/multimodal/processing/test_qwen.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Tests for Qwen's multimodal preprocessing kwargs.""" -from typing import Dict, List, Union - -import pytest -import torch -from PIL.Image import Image - -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.utils import cached_get_tokenizer - -from ....conftest import IMAGE_ASSETS -from ...utils import build_model_context - -### Multimodal preprocessing tests -SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image -# These values are specific to Qwen-VL/Chat; we can get these from the model -# config also, but they are hardcoded here to keep the 
parameterize/fixtures -# easy to read. -IMG_START_ID = 151857 -IMG_END_ID = 151858 -IMG_PAD_ID = 151859 -TOKS_PER_IMG = 256 -VIS_ENC_DIM = 4096 -IMG_SIZE = 448 - - -@pytest.fixture() -def input_mapper_for_qwen(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.qwen import input_mapper_for_qwen - return input_mapper_for_qwen - - -@pytest.fixture() -def input_processor_for_qwen(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.qwen import input_processor_for_qwen - return input_processor_for_qwen - - -@pytest.fixture() -def qwen_vl_context() -> InputContext: - """Get an InputContext for Qwen-VL.""" - return build_model_context(model_name="Qwen/Qwen-VL", - trust_remote_code=True) - - -# Happy path tests for single/multi-image scenarios for the multimodal -# input processor and mapper, respectively -@pytest.mark.parametrize("num_images", [1, 2]) -def test_input_processor_valid_mm_data(input_processor_for_qwen, - qwen_vl_context: InputContext, - num_images: int): - """Happy cases for image inputs to Qwen's multimodal input processor.""" - prompt = "".join( - [f"Picture {num}: \n" for num in range(1, num_images + 1)]) - inputs = token_inputs( - prompt=prompt, - # When processing multimodal data for a multimodal model, the qwen - # input processor will overwrite the provided prompt_token_ids with - # the image prompts - prompt_token_ids=[], - multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, - ) - proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) - assert isinstance(proc_inputs, dict) - - # Each image should have one start / stop and a fixed context of 256 - proc_tokens = proc_inputs["prompt_token_ids"] - assert proc_tokens.count(IMG_START_ID) == num_images - assert proc_tokens.count(IMG_END_ID) == num_images - assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG - - -@pytest.mark.parametrize( - "img_data,expected_shape", - [ - # single / multi-image - (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)), - (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)), - # single / multi-image embeddings - (torch.rand( - (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), - (torch.rand( - (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), - (torch.rand( - (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)), - ]) -def test_input_mapper_valid_mm_data(input_mapper_for_qwen, - qwen_vl_context: InputContext, - img_data: Union[torch.Tensor, List[Image], - Image], - expected_shape: List[int]): - """Happy cases for image inputs to Qwen's multimodal input mapper.""" - mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) - # Ensure that we get the appropriately shaped pixel_values - # for images and image embeddings, respectively. 
- assert isinstance(mapped_img_data, MultiModalKwargs) - assert "pixel_values" in mapped_img_data - assert mapped_img_data["pixel_values"].shape == expected_shape - - -# Sad path tests for the multimodal input processor and mapper, respectively -@pytest.mark.parametrize("mm_data", [ - { - "image": torch.rand(5) - }, - { - "image": torch.rand((5, 5, 5, 5, 5)) - }, -]) -def test_input_processor_invalid_mm_data(input_processor_for_qwen, - qwen_vl_context: InputContext, - mm_data: Dict[str, torch.Tensor]): - """Test sad cases validated in Qwen's multimodal input processor.""" - tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer, - trust_remote_code=True) - prompt = "Picture 1: \n" - prompt_token_ids = tokenizer.encode(prompt) - inputs = token_inputs(prompt=prompt, - prompt_token_ids=prompt_token_ids, - multi_modal_data=mm_data) - # Should fail since we have too many or too few dimensions for embeddings - with pytest.raises(ValueError): - input_processor_for_qwen(qwen_vl_context, inputs) - - -@pytest.mark.parametrize( - "img_data", - [ - # Wrong context length - torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)), - # Wrong visual encoder output size - torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)), - ]) -def test_input_mapper_invalid_mm_data( - input_mapper_for_qwen, - qwen_vl_context: InputContext, - img_data: Union[torch.Tensor, List[Image], Image], -): - """Sad cases validated in Qwen VL's multimodal input mapper.""" - with pytest.raises(ValueError): - input_mapper_for_qwen(qwen_vl_context, img_data) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 1345b381f0a99..86a9d3089c3ee 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -4,26 +4,28 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" +import copy import math import re -from functools import partial -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Set, Tuple, TypedDict, Union) +import unicodedata +from functools import lru_cache, partial +from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable, + List, Literal, Mapping, Optional, Set, Tuple, TypedDict, + Union) -import numpy as np import torch -from PIL import Image from torch import nn from torchvision import transforms from torchvision.transforms import InterpolationMode -from transformers import PretrainedConfig +from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer, + TensorType) +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -42,15 +44,20 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.utils 
import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -353,8 +360,10 @@ def __init__(self, self.ln_post = norm_layer(output_dim) self.proj = nn.Parameter( (output_dim**-0.5) * torch.randn(output_dim, output_dim)) + self.image_start_id = image_start_id self.image_end_id = image_start_id + 1 + self.image_pad_id = image_start_id + 2 def forward(self, x: torch.Tensor) -> torch.Tensor: x = x.to( @@ -383,21 +392,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x - def get_image_positions(self, - input_ids: torch.Tensor) -> Optional[torch.Tensor]: - """Given the input IDs, extracts start/stop points corresponding to - images. - - args: - Returns: - Optional torch tensor corresponding to start/stop pairs of images. - """ - if torch.any(input_ids == self.image_start_id): - bos_pos = torch.where(input_ids == self.image_start_id) - eos_pos = torch.where(input_ids == self.image_end_id) - return torch.stack((bos_pos[0], eos_pos[0]), dim=1) - return None - class QWenMLP(nn.Module): """MLP for the language component of the Qwen model, which contains a @@ -579,9 +573,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - self.visual = VisionTransformer(**config.visual, - quant_config=quant_config) if hasattr( - config, "visual") else None + + if (vision_config := getattr(config, "visual", None)): + self.visual = VisionTransformer(**vision_config, + quant_config=quant_config) + else: + self.visual = None def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.wte(input_ids) @@ -593,38 +590,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], - pixel_values: Optional[QwenImageInputs], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - img_pos = None - # If pixel / visual embeddings are provided, this is a visual model - if pixel_values is not None and self.visual is not None: - if pixel_values["type"] != "image_embeds": - image_embeds = self.visual(pixel_values["data"]) - else: - image_embeds = pixel_values["data"] - - # features should be of shape (# images, 256, hidden_dim) - img_pos = self.visual.get_image_positions(input_ids) - if isinstance( - img_pos, - np.ndarray) and img_pos.shape[0] != image_embeds.shape[0]: - raise ValueError( - f"Number of placeholders: {img_pos.shape[0]} " - f"does not match number of images {image_embeds.shape[0]}." 
- ) - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) - hidden_states = self.wte(input_ids) - # Merge the image embeddings into the hidden states if actually have - # visual features and the corresponding image tokens - if img_pos is not None: - for idx, (img_bos, img_eos) in enumerate(img_pos): - hidden_states[img_bos + 1:img_eos] = image_embeds[idx] residual = None else: assert intermediate_tensors is not None @@ -648,159 +620,9 @@ def forward( return hidden_states -def get_image_text(image_num: int, padding: bool) -> str: - """Retrieves a placeholder text that when tokenized, will be expanded with - image pads. - - Args: - image_num: The number of the image that we want a text prompt for. - Images should be indexed starting at 1. - padding: Whether or not padding should be manually added. - - Returns: - Text placeholder prompt for the image being considered. - """ - image_start = f"Picture {image_num}: {IMG_START}" - image_end = f"{IMG_END}\n" - if not padding: - return f"{image_start}{image_end}" - return f"{image_start}{MAX_QWEN_IMG_TOKENS * IMG_PAD}{image_end}" - - -def input_processor_for_qwen(ctx: InputContext, - inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: - """Processes the inputs, which may or may not be multimodal. - Multimodal inputs will only be processed if the model has a "visual" - component in its model config, otherwise they'll be ignored. - - Args: - ctx: Context of the loaded model. - inputs: LLM inputs which may have a multi_modal_data attribute. - - Returns: - If the model is language only or not multimodal inputs were provided, - returns inputs unmodified. Otherwise, processes the multimodal - images / image embeddings and adds the fixed-length image placeholders. - """ - multi_modal_data = inputs.get("multi_modal_data") - - # Only process images if we have multimodal data and a visual config - hf_config = ctx.get_hf_config() - if (multi_modal_data is None or "image" not in multi_modal_data - or not hasattr(hf_config, "visual")): - return inputs - - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_data = multi_modal_data["image"] - if isinstance(image_data, torch.Tensor): - num_dims = len(image_data.shape) - if num_dims < 2 or num_dims > 3: - raise ValueError( - f"Expected img embeds to be have 3 dimensions, got {num_dims}") - num_images = 1 if num_dims == 2 else image_data.shape[0] - elif isinstance(image_data, Image.Image): - num_images = 1 - elif is_list_of(image_data, Image.Image): - num_images = len(image_data) - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - # Drops anything between / tags; encoding with the tokenizer - # will automatically add the image pads for the context. 
- new_prompt, num_matched_images = re.subn( - r"(Picture \d*: ).*?(<\/img>\n)", - r"\1\2", - prompt, - ) - - if num_matched_images != num_images: - logger.warning( - "Number of matched image placeholders %s doesn't match the number " - "of expected images %s; check your placeholder formatting.", - num_matched_images, num_images) - - new_prompt_token_ids = tokenizer.encode(new_prompt) - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data) - - -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: - """Maps the input data to its MultiModalKwargs (if any). - - Args: - ctx: Context of the loaded model. - data: data potentially containing image/image embeddings to be mapped - to pixel_values in .forward() for a visual QWenLMHeadModel model. - - Returns: - MultiModalKwargs containing the stacked normalized images tensor or - image embeddings. - """ - # Early exit if we have provided an image to a language only Qwen model - hf_config = ctx.get_hf_config() - if not hasattr(hf_config, "visual"): - logger.warning( - "Images were provided but this model has no visual config; " - "multimodal inputs will not be forwarded to the model.") - return MultiModalKwargs() - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - image_pair_tok = tokenizer.encode(IMG_START + IMG_END, - add_special_tokens=False, - return_tensors="pt").squeeze() - image_start_id = image_pair_tok[0] - image_end_id = image_pair_tok[-1] - if (image_start_id + 1) != image_end_id: - raise ValueError( - f"Found image end ID {image_end_id}, but expected {IMG_START} + 1") - if len(image_pair_tok) != (MAX_QWEN_IMG_TOKENS + 2): - raise ValueError( - f"Expected image context length of {MAX_QWEN_IMG_TOKENS}, " - f"but got {image_pair_tok - 2}") - - hf_config = ctx.get_hf_config() - image_size = hf_config.visual["image_size"] - img_emb_size = hf_config.visual["output_dim"] - - if isinstance(data, torch.Tensor): - # It's expected that our values have already been processed - # by the visual transformer; shape is expected to be: - # (# images, 256, hidden_size) - if len(data.shape) == 2: - # Assume only one image embed was provided; unsqueeze the extra dim - data = data.unsqueeze(0) - if len(data.shape) != 3 or data.shape[ - 1] != MAX_QWEN_IMG_TOKENS or data.shape[2] != img_emb_size: - raise ValueError( - "Expected image embeds to be a tensor of shape" - f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but " - f"received shape [{data.shape}]") - pixel_values = data - else: - transform = build_normalization_transform(image_size) - if not isinstance(data, (list, tuple)): - data = [data] - transformed_images = [transform(datum) for datum in data] - pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalKwargs({"pixel_values": pixel_values}) - - def build_normalization_transform(image_size: int) -> transforms.Compose: - """Builds a normalization transform which can be applied to one or + """ + Build a normalization transform which can be applied to one or more input images from which we want to extract visual features. 
Args: @@ -817,62 +639,251 @@ def build_normalization_transform(image_size: int) -> transforms.Compose: ]) -def dummy_data_for_qwen( - ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], -) -> DummyData: - """Build dummy data for warming up Qwen models; this will only contain text - matching the defaults for VLLM unless the model has a visual config. +@lru_cache(maxsize=1) +def _get_tokenizer_without_image_pad( + tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: + """ + The logic of adding image pad tokens should only be applied in + :class:`QWenVLProcessor`, so they are patched out here. - Args: - ctx: Context of the loaded model. - seq_len: Number of tokens in the text sequence. - mm_counts: multimodal data counts. - - Returns: - Tuple containing sequential and multimodal data. + The definition of the wrapped tokenizer can be found here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py + """ + new_tokenizer = copy.deepcopy(tokenizer) + + class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore + + def tokenize( + self, + text: str, + allowed_special: Union[AbstractSet[str], str] = "all", + disallowed_special: Union[Collection[str], str] = (), + **kwargs, + ) -> list[Union[bytes, str]]: + text = unicodedata.normalize("NFC", text) + + return [ + self.decoder[t] for t in self.tokenizer.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ] + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: Optional[str] = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, + errors=errors or self.errors, + ) + + TokenizerWithoutImagePad.__name__ = \ + f"{tokenizer.__class__.__name__}WithoutImagePad" + + new_tokenizer.__class__ = TokenizerWithoutImagePad + return new_tokenizer + + +class QWenVLProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. + + We call the wrapped tokenizer to automatically insert image pad tokens: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 + + The image processor is defined here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 """ - hf_config = ctx.get_hf_config() - - # The presence of a visual config indicates this is a multimodal model. - # If we don't have it, the model is considered an LLM for warmup purposes. - if not hasattr(hf_config, "visual"): - seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - mm_data = None - return DummyData(seq_data, mm_data) - - # We have a visual component - use images to warm up - num_images = mm_counts["image"] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - # Build the image prompts with no imgpads; the tokenizer will add img pads - image_prompt = ''.join( - [get_image_text(idx, False) for idx in range(1, num_images + 1)]) - toks = tokenizer.encode(image_prompt, add_special_tokens=False) - - # Make sure we actually get the fixed context size per tok padding - num_pads = toks.count(tokenizer.encode(IMG_PAD)[0]) - if num_pads != (num_images * MAX_QWEN_IMG_TOKENS): - raise ValueError( - f"Tokenized dummy data should encode {MAX_QWEN_IMG_TOKENS} pads" - f" per image, but got {num_pads} pads for {num_images} image(s)" - " in total. 
Are you using a qwen tokenizer?") - - # Ensure the number of tokens is at minimum the sequence length provided - if len(toks) < seq_len: - toks += [0] * (seq_len - len(toks)) - - seq_data = SequenceData.from_seqs(toks) - - # Build the input images; width/height doesn't actually matter here since - # the data will get resized and the # of tokens per image is constant - image = Image.new("RGB", (224, 224), color=0) - mm_data = {"image": image if num_images == 1 else [image] * num_images} - return DummyData(seq_data, mm_data) + + def __init__( + self, + config: PretrainedConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + if hasattr(self.config, "visual"): + self.image_transform = build_normalization_transform( + config.visual["image_size"]) + else: + self.image_transform = None + + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + self.img_start_id = special_tokens[IMG_START] + self.img_end_id = special_tokens[IMG_END] + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + if self.image_transform is None: + raise ValueError("This model does not support image inputs") + + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class QWenVLProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self) -> PreTrainedTokenizer: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return _get_tokenizer_without_image_pad(tokenizer) + + def get_hf_processor(self) -> QWenVLProcessor: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return QWenVLProcessor(self.get_hf_config(), tokenizer) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + return MAX_QWEN_IMG_TOKENS + + +class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + if not hasattr(hf_config, "visual"): + return ProcessorInputs(prompt_text="", mm_data={}) + + vision_config = hf_config.visual + + max_image_size = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n" + for i in range(1, num_images + 1)), + mm_data=mm_data, + ) + + +class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> 
BatchFeature: + # Drops anything between / tags; encoding with the tokenizer + # will automatically add the image pads for the context. + prompt, num_matched_images = re.subn( + r"(Picture \d*: ).*?(<\/img>\n)", + r"\1\2", + prompt, + ) + + image_data = mm_data.get("images") + if image_data is not None: + assert isinstance(image_data, list) + + num_images = len(image_data) + if num_matched_images != num_images: + logger.warning( + "Number of matched image placeholders %s doesn't match " + "the number of expected images %s; check your placeholder " + "formatting.", num_matched_images, num_images) + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + + img_start_id = special_tokens[IMG_START] + img_end_id = special_tokens[IMG_END] + img_pad_id = special_tokens[IMG_PAD] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [img_pad_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[img_start_id, img_end_id], + replacement=PromptReplacementDetails( + full=[img_start_id] + image_tokens + [img_end_id], + features=image_tokens, + ), + ) + ] class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA): @@ -898,38 +909,77 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) - def _get_image_input_type( - self, - pixel_values: Optional[torch.Tensor]) -> Optional[QwenImageInputs]: - """Determines if the provided pixel_values are normalized pixel values - or image embeddings. + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.visual["image_size"] + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) - Args: - pixel_values: Optional data to processed into visual embeddings. + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[QwenImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return QwenImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + + return QwenImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) - Returns: - None of the QwenImageInputs type used to determine whether or not - the visual transformer needs to process the pixel_values. - """ - if pixel_values is not None and self.transformer.visual is not None: - pixel_values = flatten_bn(pixel_values) - if len(pixel_values.shape) == 3 and pixel_values.shape[ - 1] == MAX_QWEN_IMG_TOKENS and pixel_values.shape[ - 2] == self.config.visual["output_dim"]: - return QwenImageEmbeddingInputs( - type="image_embeds", - data=pixel_values, - ) - else: - # If we have the wrong shape, assume we still need to process - return QwenImagePixelInputs( - type="pixel_values", - data=pixel_values, - ) return None - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.transformer.get_input_embeddings(input_ids) + def _process_image_input(self, + image_input: QwenImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.transformer.visual is not None + return self.transformer.visual(image_input["data"]) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.transformer.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + assert self.transformer.visual is not None + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.transformer.visual.image_pad_id) + + return inputs_embeds def forward( self, @@ -938,18 +988,23 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - pixel_values: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None - pixel_values = None - else: - pixel_values = self._get_image_input_type(pixel_values) hidden_states = self.transformer(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, - pixel_values, inputs_embeds) + inputs_embeds) return hidden_states def compute_logits( @@ -1063,10 +1118,9 @@ def get_mm_mapping(self) -> MultiModelKeys: tower_model="transformer.visual.transformer") -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen) -@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) -@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) +@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor, + info=QWenVLProcessingInfo, + dummy_inputs=QWenVLDummyInputsBuilder) class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ QWenLMHeadModel is not only applicable to LLM but also to VL, which is not @@ -1084,7 +1138,7 @@ def __new__( cls, vllm_config: VllmConfig, prefix: str = "", - ) -> None: + ) -> QWenBaseModel: config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): From 925d2f19089b50736ce5e0f2ba0c9b7f3da6fb15 Mon Sep 17 00:00:00 2001 From: Jun Duan Date: Tue, 28 Jan 2025 11:37:10 -0500 Subject: [PATCH 19/69] [Doc] Fix typo for x86 CPU installation (#12514) Signed-off-by: Jun Duan --- docs/source/getting_started/installation/cpu/x86.inc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index e4f99d3cebdf2..e0eaac5099305 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -18,7 +18,7 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, ::: ```{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. +- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. 
``` From 3fd1fb63efb6c96f30237b12e2816b4f2c5323d0 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 28 Jan 2025 16:38:38 +0000 Subject: [PATCH 20/69] [V1][Metrics] Hook up IterationStats for Prometheus metrics (#12478) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 7 ++- vllm/v1/engine/async_llm.py | 3 +- vllm/v1/metrics/loggers.py | 68 ++++++++++++++++++++---- 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 469a5fb039fb6..64deaedf0f2c1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -105,8 +105,6 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool): - if use_v1: - pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): + if use_v1 and metric_family not in EXPECTED_METRICS_V1: + continue + found_metric = False # Check to see if the metric_family is found in the prom endpoint. @@ -199,6 +200,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", + "vllm:prompt_tokens_total", + "vllm:generation_tokens_total", ] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 917d52d3220b8..022b6d0668e99 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -305,7 +305,8 @@ def _log_stats( return for logger in self.stat_loggers: - logger.log(scheduler_stats=scheduler_stats) + logger.log(scheduler_stats=scheduler_stats, + iteration_stats=iteration_stats) def encode( self, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b84f03fa3267c..6a7bb423749e1 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,11 +1,12 @@ import time from abc import ABC, abstractmethod -from typing import Dict +from typing import Dict, List +import numpy as np import prometheus_client from vllm.logger import init_logger -from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -15,27 +16,61 @@ class StatLoggerBase(ABC): @abstractmethod - def log(self, scheduler_stats: SchedulerStats): + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): ... class LoggingStatLogger(StatLoggerBase): def __init__(self): - self.last_log_time = time.monotonic() + self._reset(time.monotonic()) - def log(self, scheduler_stats: SchedulerStats): - """Log Stats to standard output.""" + def _reset(self, now): + self.last_log_time = now + + # Tracked stats over current local logging interval. + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + def _local_interval_elapsed(self, now: float) -> bool: # Log every _LOCAL_LOGGING_INTERVAL_SEC. + elapsed_time = now - self.last_log_time + return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC + + def _track_iteration_stats(self, iteration_stats: IterationStats): + # Save tracked stats for token counters. 
+ self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens) + self.num_generation_tokens.append( + iteration_stats.num_generation_tokens) + + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + # Compute summary metrics for tracked stats + return float(np.sum(tracked_stats) / (now - self.last_log_time)) + + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): + """Log Stats to standard output.""" + + self._track_iteration_stats(iteration_stats) + now = time.monotonic() - if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: + if not self._local_interval_elapsed(now): return - self.last_log_time = now + + prompt_throughput = self._get_throughput(self.num_prompt_tokens, now) + generation_throughput = self._get_throughput( + self.num_generation_tokens, now) + + self._reset(now) # Format and print output. logger.info( + "Avg prompt throughput: %.1f tokens/s, " + "Avg generation throughput: %.1f tokens/s, " "Running: %d reqs, Waiting: %d reqs ", + prompt_throughput, + generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) @@ -61,11 +96,26 @@ def __init__(self, labels: Dict[str, str]): documentation="Number of requests waiting to be processed.", labelnames=labelnames).labels(*labelvalues) - def log(self, scheduler_stats: SchedulerStats): + self.counter_prompt_tokens = prometheus_client.Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_generation_tokens = prometheus_client.Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): """Log to prometheus.""" self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) + self.counter_generation_tokens.inc( + iteration_stats.num_generation_tokens) + @staticmethod def _unregister_vllm_metrics(): # Unregister any existing vLLM collectors (for CI/CD From 0f657bdc52d4ad1d079beddf8e7556c419aca7b4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 28 Jan 2025 14:06:32 -0500 Subject: [PATCH 21/69] Replace missed warning_once for rerank API (#12472) Signed-off-by: mgoin --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 45cf06566faaa..077bc993726ae 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -528,7 +528,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request): @router.post("/v1/rerank") @with_cancellation async def do_rerank_v1(request: RerankRequest, raw_request: Request): - logger.warning( + logger.warning_once( "To indicate that the rerank API is not part of the standard OpenAI" " API, we have located it at `/rerank`. Please update your client" "accordingly. 
(Note: Conforms to JinaAI rerank API)") From f26d790718b8e50a11a366f3301b6a9300377797 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 28 Jan 2025 20:05:27 +0000 Subject: [PATCH 22/69] Do not run `suggestion` `pre-commit` hook multiple times (#12521) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7b32df90bfd8b..77010090965d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,3 +90,4 @@ repos: entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' language: system verbose: true + pass_filenames: false From c386c43ca3a7156a953e0ca4d8f2c2f36ccf1423 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 28 Jan 2025 22:07:22 +0000 Subject: [PATCH 23/69] [V1][Metrics] Add per-request prompt/generation_tokens histograms (#12516) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 6 +++ vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/output_processor.py | 11 ++++- vllm/v1/metrics/loggers.py | 60 +++++++++++++++++++++--- vllm/v1/metrics/stats.py | 36 ++++++++++++-- 5 files changed, 102 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 64deaedf0f2c1..9a84c82b62fdf 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -202,6 +202,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:num_requests_waiting", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", + "vllm:request_prompt_tokens_sum", + "vllm:request_prompt_tokens_bucket", + "vllm:request_prompt_tokens_count", + "vllm:request_generation_tokens_sum", + "vllm:request_generation_tokens_bucket", + "vllm:request_generation_tokens_count", ] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 022b6d0668e99..b9dc3561d1750 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -53,8 +53,7 @@ def __init__( self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - PrometheusStatLogger(labels=dict( - model_name=self.model_config.served_model_name)), + PrometheusStatLogger(vllm_config.model_config), ] # Tokenizer (+ ensure liveness if running in another process). diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 564eab51bd3a8..39217b8090140 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -8,7 +8,7 @@ from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.detokenizer import (DetokenizerOutput, IncrementalDetokenizer) -from vllm.v1.metrics.stats import IterationStats +from vllm.v1.metrics.stats import IterationStats, RequestStateStats @dataclass @@ -37,6 +37,8 @@ def __init__( self.is_prefilling = True self.queue = queue + self.stats = RequestStateStats() + @classmethod def from_new_request( cls, @@ -146,7 +148,8 @@ def process_outputs( # 1) Compute stats for this iteration. iteration_stats.update_from_output(engine_core_output, req_state.is_prefilling, - req_state.prompt_len) + req_state.prompt_len, + req_state.stats) req_state.is_prefilling = False # 2) Detokenize the token ids into text. @@ -171,6 +174,10 @@ def process_outputs( # detected stop string, abort needed in EngineCore. 
reqs_to_abort.append(req_id) + # Track per-request stats + iteration_stats.update_from_finished_request( + request_output, req_state.stats) + return OutputProcessorOutput( request_outputs=request_outputs, reqs_to_abort=reqs_to_abort, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6a7bb423749e1..87d9d63652c05 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,10 +1,11 @@ import time from abc import ABC, abstractmethod -from typing import Dict, List +from typing import List import numpy as np import prometheus_client +from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -78,13 +79,13 @@ def log(self, scheduler_stats: SchedulerStats, class PrometheusStatLogger(StatLoggerBase): - def __init__(self, labels: Dict[str, str]): - self.labels = labels + def __init__(self, model_config: ModelConfig): + self._unregister_vllm_metrics() - labelnames = self.labels.keys() - labelvalues = self.labels.values() + labelnames = ["model_name"] + labelvalues = [model_config.served_model_name] - self._unregister_vllm_metrics() + max_model_len = model_config.max_model_len self.gauge_scheduler_running = prometheus_client.Gauge( name="vllm:num_requests_running", @@ -106,6 +107,20 @@ def __init__(self, labels: Dict[str, str]): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) + self.histogram_num_prompt_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_num_generation_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + def log(self, scheduler_stats: SchedulerStats, iteration_stats: IterationStats): """Log to prometheus.""" @@ -116,9 +131,42 @@ def log(self, scheduler_stats: SchedulerStats, self.counter_generation_tokens.inc( iteration_stats.num_generation_tokens) + for finished_request in iteration_stats.finished_requests: + self.histogram_num_prompt_tokens_request.observe( + finished_request.num_prompt_tokens) + self.histogram_num_generation_tokens_request.observe( + finished_request.num_generation_tokens) + @staticmethod def _unregister_vllm_metrics(): # Unregister any existing vLLM collectors (for CI/CD for collector in list(prometheus_client.REGISTRY._collector_to_names): if hasattr(collector, "_name") and "vllm" in collector._name: prometheus_client.REGISTRY.unregister(collector) + + +def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: + """ + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values until the value exceeds the specified maximum. 
+ + """ + exponent = 0 + buckets: List[int] = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 + + +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + return build_buckets([1, 2, 5], max_value) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 60cb986f8bbce..55d85a7992cc5 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,7 +1,8 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List if TYPE_CHECKING: + from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput @@ -16,6 +17,21 @@ class SchedulerStats: # gpu_prefix_cache_hit_rate: float = 0.0 +@dataclass +class RequestStateStats: + """Stats that need to be tracked across delta updates.""" + + num_generation_tokens: int = 0 + + +@dataclass +class FinishedRequestStats: + """Stats associated with a finished request.""" + + num_prompt_tokens: int = 0 + num_generation_tokens: int = 0 + + class IterationStats: """Stats associated with a single set of EngineCoreOutputs.""" @@ -23,17 +39,29 @@ def __init__(self, log_stats: bool): self.log_stats = log_stats self.num_generation_tokens = 0 self.num_prompt_tokens = 0 + self.finished_requests: List[FinishedRequestStats] = [] def update_from_output(self, output: "EngineCoreOutput", - is_prefilling: bool, prompt_len: int): + is_prefilling: bool, prompt_len: int, + request_state_stats: RequestStateStats): if not self.log_stats: return - self.num_generation_tokens += len(output.new_token_ids) + num_new_generation_tokens = len(output.new_token_ids) + + self.num_generation_tokens += num_new_generation_tokens if is_prefilling: # This relies on the invariant that EngineCore does # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert (len(output.new_token_ids) > 0) + assert (num_new_generation_tokens > 0) self.num_prompt_tokens += prompt_len + + request_state_stats.num_generation_tokens += num_new_generation_tokens + + def update_from_finished_request(self, request_output: "RequestOutput", + request_state_stats: RequestStateStats): + self.finished_requests.append( + FinishedRequestStats(len(request_output.prompt_token_ids), + request_state_stats.num_generation_tokens)) From 80fcc3ed1c940ea43e1b495bbdf8b9765f837128 Mon Sep 17 00:00:00 2001 From: fenghuizhang <159459388+fenghuizhang@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:36:44 -0800 Subject: [PATCH 24/69] [Kernel] Pipe attn_logits_soft_cap through paged attention TPU kernels (#12482) Signed-off-by: Fenghui Zhang --- .buildkite/run-tpu-test.sh | 0 vllm/attention/backends/pallas.py | 42 ++++++++++++------------------- 2 files changed, 16 insertions(+), 26 deletions(-) mode change 100644 => 100755 .buildkite/run-tpu-test.sh diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh old mode 100644 new mode 100755 diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index facdee6b29e39..209a623ba441c 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -110,6 +110,7 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.logits_soft_cap = logits_soft_cap if head_size % 128 != 0: raise NotImplementedError("Head size must be a multiple of 128.") if alibi_slopes is not None: @@ -120,9 +121,6 @@ def __init__( raise NotImplementedError("FP8 KV cache dtype is not supported.") if blocksparse_params is not None: raise NotImplementedError("Blocksparse is not supported.") - if logits_soft_cap is not None: - raise NotImplementedError( - "Attention logits soft-capping is not supported.") if torch_xla.tpu.version() < 4: raise NotImplementedError("TPU version must be 4 or higher.") @@ -230,6 +228,7 @@ def forward( num_kv_pages_per_compute_block, num_queries_per_compute_block, use_kernel=True, + attn_logits_soft_cap=self.logits_soft_cap, ) else: # Decoding run. @@ -257,6 +256,7 @@ def forward( attn_metadata.block_tables, pages_per_compute_block, self.megacore_mode, + attn_logits_soft_cap=self.logits_soft_cap, ) else: chunk_size = max_num_seq @@ -280,6 +280,7 @@ def forward( attn_metadata.block_tables[chunk_start:chunk_end], pages_per_compute_block, self.megacore_mode, + attn_logits_soft_cap=self.logits_soft_cap, ) output[chunk_start:chunk_end] = chunk_output @@ -313,6 +314,8 @@ def paged_attention( block_tables: torch.Tensor, pages_per_compute_block: int, megacore_mode: Optional[str], + *, + attn_logits_soft_cap: Optional[float], ) -> torch.Tensor: batch_size = query.shape[0] if megacore_mode == "batch" and batch_size % 2 != 0: @@ -320,26 +323,13 @@ def paged_attention( else: megacore_mode = megacore_mode - # NOTE(woosuk): A temporary workaround to avoid the error: - # "xla::paged_attention() Expected a value of type 'str' for - # argument 'megacore_mode' but instead found type 'NoneType'." 
- if megacore_mode is not None: - output = torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - megacore_mode=megacore_mode, - ) - else: - output = torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - ) - return output + return torch.ops.xla.paged_attention( + query, + key_cache, + value_cache, + context_lens, + block_tables, + pages_per_compute_block, + megacore_mode=megacore_mode, + attn_logits_soft_cap=attn_logits_soft_cap, + ) From fbb5bd4cefd62e3e389e2b873d5859eb8e07cbfa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 28 Jan 2025 22:16:47 -0500 Subject: [PATCH 25/69] [TPU] Add example for profiling TPU inference (#12531) Signed-off-by: mgoin --- .../offline_inference/profiling_tpu/README.md | 67 ++++++++++++ .../profiling_tpu/profiling.py | 101 ++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 examples/offline_inference/profiling_tpu/README.md create mode 100644 examples/offline_inference/profiling_tpu/profiling.py diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md new file mode 100644 index 0000000000000..08efa63dc1021 --- /dev/null +++ b/examples/offline_inference/profiling_tpu/README.md @@ -0,0 +1,67 @@ +# vLLM TPU Profiling + +This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes. + +Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes. + +We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/ai_accelerator/index.html). + +> In all examples below, we run several warmups before (so `--enforce-eager` is okay) + +## Profile Examples + +### Generate Prefill Trace + +This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations. + +```bash +export XLA_HLO_DEBUG=1 +export MODEL=Qwen/Qwen2.5-7B-Instruct +export VLLM_TPU_PROFILE_DURATION_MS=3000 +export VLLM_TPU_PROFILE_DELAY_MS=0 + +python3 profiling.py \ + --model $MODEL \ + --input-len 1024 --output-len 1 \ + --batch-size 1 --enforce-eager \ + --max-model-len 2048 \ + --tensor-parallel-size 1 \ + --profile-result-dir profiles +``` + + +### Generate Decode Trace + +This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). + +```bash +export XLA_HLO_DEBUG=1 +export MODEL=meta-llama/Llama-3.1-70B-Instruct +export VLLM_TPU_PROFILE_DURATION_MS=2000 +export VLLM_TPU_PROFILE_DELAY_MS=1000 + +rm -rf ~/.cache/vllm/xla_cache +python3 profiling.py \ + --model $MODEL \ + --input-len 1 \ + --output-len 128 \ + --batch-size 32 \ + --enforce-eager \ + --profile-result-dir profiles \ + --max-model-len 2048 --tensor-parallel-size 8 +``` + + +## Visualizing the profiles + +Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). 
+ +Here are most likely the dependencies you need to install: +```bash +pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources +``` + +Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: +```bash +tensorboard --logdir profiles/ --port 6006 +``` \ No newline at end of file diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py new file mode 100644 index 0000000000000..d7423e6c6da93 --- /dev/null +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -0,0 +1,101 @@ +import argparse +import dataclasses +import os +import time +from typing import List + +import numpy as np +import torch_xla.debug.profiler as xp +from tqdm import tqdm + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.utils import FlexibleArgumentParser + +DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) +DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) + + +def main(args: argparse.Namespace): + print(args) + + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + _ = xp.start_server(9012) + + sampling_params = SamplingParams( + temperature=0.0, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompts: List[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] + + def run_to_completion(): + start_time = time.perf_counter() + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + # Warmup + print("Warming up...") + warmup_latencies = [] + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + warmup_latencies.append(run_to_completion()) + print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s") + + # Profile + profile_dir = args.profile_result_dir + print(f"Profiling (results will be saved to '{profile_dir}')...") + # Enable tracing on server + xp.trace_detached("localhost:9012", + profile_dir, + delay_ms=DELAY_MS, + duration_ms=DURATION_MS) + if DELAY_MS == 0: + time.sleep(1.0) + profile_latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profile iterations"): + profile_latencies.append(run_to_completion()) + print(f"Average profile latency: {np.mean(profile_latencies):.4f}s") + + return + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--num-iters-warmup', + type=int, + default=5, + help='Number of iterations to run for warmup.') + parser.add_argument('--num-iters', + type=int, + default=1, + help='Number of iterations to run for profiling.') + parser.add_argument( + '--profile-result-dir', + type=str, + default="profiles", + help= + ('path to save the pytorch profiler output. Can be visualized ' + 'with ui.perfetto.dev or Tensorboard ' + '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).' 
+ )) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) From a7e3eba66fff82f7e12bb2354c4b26635f0f7761 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 29 Jan 2025 11:38:08 +0800 Subject: [PATCH 26/69] [Frontend] Support reasoning content for deepseek r1 (#12473) Signed-off-by: Ce Gao Co-authored-by: Rafael Vasquez Co-authored-by: Cyrus Leung Co-authored-by: Michael Goin --- docs/source/features/reasoning_outputs.md | 151 +++++++++++++++++ docs/source/index.md | 1 + .../openai_chat_completion_with_reasoning.py | 53 ++++++ ...hat_completion_with_reasoning_streaming.py | 90 ++++++++++ .../openai/reasoning_parsers/__init__.py | 0 .../test_deepseekr1_reasoning_parser.py | 120 +++++++++++++ .../openai/reasoning_parsers/utils.py | 93 +++++++++++ tests/entrypoints/openai/test_cli_args.py | 29 ++++ vllm/entrypoints/openai/api_server.py | 10 ++ vllm/entrypoints/openai/cli_args.py | 30 ++++ vllm/entrypoints/openai/protocol.py | 2 + .../openai/reasoning_parsers/__init__.py | 6 + .../abs_reasoning_parsers.py | 158 ++++++++++++++++++ .../deepseek_r1_reasoning_parser.py | 133 +++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 105 +++++++++++- vllm/scripts.py | 1 + 16 files changed, 977 insertions(+), 5 deletions(-) create mode 100644 docs/source/features/reasoning_outputs.md create mode 100644 examples/online_serving/openai_chat_completion_with_reasoning.py create mode 100644 examples/online_serving/openai_chat_completion_with_reasoning_streaming.py create mode 100644 tests/entrypoints/openai/reasoning_parsers/__init__.py create mode 100644 tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py create mode 100644 tests/entrypoints/openai/reasoning_parsers/utils.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/__init__.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md new file mode 100644 index 0000000000000..e39bbacf1138d --- /dev/null +++ b/docs/source/features/reasoning_outputs.md @@ -0,0 +1,151 @@ +(reasoning-outputs)= + +# Reasoning Outputs + +vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. + +Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. + +## Supported Models + +vLLM currently supports the following reasoning models: + +- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) (`deepseek_r1`, which looks for ` ... `) + +## Quickstart + +To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Next, make a request to the model that should return the reasoning content in the response. + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Round 1 +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) +``` + +The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. + +## Streaming chat completions + +Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). + +```json +{ + "id": "chatcmpl-123", + "object": "chat.completion.chunk", + "created": 1694268190, + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": "is", + }, + "logprobs": null, + "finish_reason": null + } + ] +} +``` + +Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests. + +## How to support a new reasoning model + +You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`. + +```python +# import the required packages + +from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import ( + ReasoningParser, ReasoningParserManager) +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + +# define a reasoning parser and register it to vllm +# the name list in register_module can be used +# in --reasoning-parser. +@ReasoningParserManager.register_module(["example"]) +class ExampleParser(ReasoningParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + Tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. 
+ """ +``` + +After defining the reasoning parser, you can use it by specifying the `--reasoning-parser` flag when making a request to the chat completion endpoint. + +```bash +vllm serve \ + --enable-reasoning --reasoning-parser example +``` + +## Limitations + +- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`). +- It is not compatible with the [`structured_outputs`](#structured_outputs) and [`tool_calling`](#tool_calling) features. +- The reasoning content is not available for all models. Check the model's documentation to see if it supports reasoning. diff --git a/docs/source/index.md b/docs/source/index.md index 2c302d3f3e863..6957d5dd0f2e7 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -90,6 +90,7 @@ models/extensions/index features/quantization/index features/lora features/tool_calling +features/reasoning_outputs features/structured_outputs features/automatic_prefix_caching features/disagg_prefill diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py new file mode 100644 index 0000000000000..83e51a48bcc6b --- /dev/null +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -0,0 +1,53 @@ +""" +An example shows how to generate chat completions from reasoning models +like DeepSeekR1. + +To run this example, you need to start the vLLM server with the reasoning +parser: + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +This example demonstrates how to generate chat completions from reasoning models +using the OpenAI Python client library. +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Round 1 +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) + +# Round 2 +messages.append({"role": "assistant", "content": content}) +messages.append({ + "role": "user", + "content": "How many Rs are there in the word 'strawberry'?", +}) +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py new file mode 100644 index 0000000000000..8c14aac6b4ecb --- /dev/null +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -0,0 +1,90 @@ +""" +An example shows how to generate chat completions from reasoning models +like DeepSeekR1. 
+
+To run this example, you need to start the vLLM server with the reasoning
+parser:
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --enable-reasoning --reasoning-parser deepseek_r1
+```
+
+Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
+streaming chat completions feature.
+
+The streaming chat completions feature allows you to receive chat completions
+in real time as they are generated by the model. This is useful when you want
+to display partial completions to the user as they arrive.
+
+Here we do not use the OpenAI Python client library, because it does not support
+`reasoning_content` fields in the response.
+"""
+
+import json
+
+import requests
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+models = requests.get(
+    f"{openai_api_base}/models",
+    headers={
+        "Authorization": f"Bearer {openai_api_key}"
+    },
+).json()
+model = models["data"][0]["id"]
+
+# Streaming chat completions
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+
+response = requests.post(
+    f"{openai_api_base}/chat/completions",
+    headers={"Authorization": f"Bearer {openai_api_key}"},
+    json={
+        "model": model,
+        "messages": messages,
+        "stream": True
+    },
+)
+
+print("client: Start streaming chat completions...")
+printed_reasoning_content = False
+printed_content = False
+# Make the streaming request
+if response.status_code == 200:
+    # Process the streaming response
+    for line in response.iter_lines():
+        if line:  # Filter out keep-alive new lines
+            # Decode the line and parse the JSON
+            decoded_line = line.decode("utf-8")
+            if decoded_line.startswith("data:"):
+                data = decoded_line[5:].strip()  # Remove "data:" prefix
+                if data == "[DONE]":  # End of stream
+                    print("\nclient: Stream completed.")
+                    break
+                try:
+                    # Parse the JSON data
+                    chunk = json.loads(data)
+                    reasoning_content = chunk["choices"][0]["delta"].get(
+                        "reasoning_content", "")
+                    content = chunk["choices"][0]["delta"].get("content", "")
+
+                    if reasoning_content:
+                        if not printed_reasoning_content:
+                            printed_reasoning_content = True
+                            print("reasoning_content:", end="", flush=True)
+                        print(reasoning_content, end="", flush=True)
+                    elif content:
+                        if not printed_content:
+                            printed_content = True
+                            print("\ncontent:", end="", flush=True)
+                        # Extract and print the content
+                        print(content, end="", flush=True)
+                except json.JSONDecodeError:
+                    print("Error decoding JSON:", decoded_line)
+else:
+    print(f"Error: {response.status_code} - {response.text}")
diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/entrypoints/openai/reasoning_parsers/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
new file mode 100644
index 0000000000000..4607e4dfe4d0b
--- /dev/null
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -0,0 +1,120 @@
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.entrypoints.openai.reasoning_parsers.utils import (
+    run_reasoning_extraction)
+from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
+                                                       ReasoningParserManager)
+
+parser_name = "deepseek_r1"
+start_token = "<think>"
+end_token = "</think>"
+
+SIMPLE_REASONING = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+NO_REASONING = {
+    "output": "This is a reasoning section",
+    "reasoning_content": None,
+    "content": "This is a reasoning section",
+}
+MULTIPLE_LINES = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+SHORTEST_REASONING_NO_STREAMING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+SHORTEST_REASONING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_streaming",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_streaming",
+    ),
+    pytest.param(
+        True,
+        NO_REASONING,
+        id="no_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING,
+        id="shortest_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING,
+        id="shortest_streaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+):
+    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+    tokenizer.add_tokens([start_token, end_token])
+    output = tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: List[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py
new file mode 100644
index 0000000000000..ac73ad50a7395
--- /dev/null
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
@@ -0,0 +1,93 @@
+from typing import List, Optional, Tuple, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
+
+
+class StreamingReasoningReconstructor:
+
+    def __init__(self):
+        self.reasoning_content = None
+        self.other_content = None
+
+    def append_delta(self, delta: DeltaMessage):
+        # content and the reasoning content should not be present
+        # at the same time
+        assert delta.content is None or delta.reasoning_content is None, (
+            "Both content and reasoning content are present in the "
+            "delta message")
+        if delta.content is not None:
+            if self.other_content is None:
+                self.other_content = delta.content
+            else:
+                self.other_content += delta.content
+        else:
+            if self.reasoning_content is None:
+                self.reasoning_content = delta.reasoning_content
+            else:
+                self.reasoning_content += delta.reasoning_content
+
+
+def run_reasoning_extraction(
+    
reasoning_parser: ReasoningParser, + model_output: List[str], + request: Union[ChatCompletionRequest, None] = None, + streaming: bool = False, +) -> Tuple[Optional[str], Optional[str]]: + if streaming: + reconstructor = run_reasoning_extraction_streaming( + reasoning_parser, + model_output, + request, + ) + return ( + reconstructor.reasoning_content, + reconstructor.other_content or None, + ) + else: + reasoning, content = run_reasoning_extraction_nonstreaming( + reasoning_parser, model_output, request) + return reasoning, content + + +def run_reasoning_extraction_nonstreaming( + reasoning_parser: ReasoningParser, + model_output: List[str], + request: Union[ChatCompletionRequest, None] = None, +) -> Tuple[Optional[str], Optional[str]]: + request = request or ChatCompletionRequest(messages=[], model="test-model") + return reasoning_parser.extract_reasoning_content( + model_output=''.join(model_output), request=request) + + +def run_reasoning_extraction_streaming( + reasoning_parser: ReasoningParser, + model_deltas: List[str], + request: Union[ChatCompletionRequest, None] = None, +) -> StreamingReasoningReconstructor: + request = request or ChatCompletionRequest(messages=[], model="test-model") + reconstructor = StreamingReasoningReconstructor() + previous_text = "" + previous_tokens: List[int] = [] + for delta in model_deltas: + token_delta = [ + reasoning_parser.vocab.get(token) + for token in reasoning_parser.model_tokenizer.tokenize(delta) + if token in reasoning_parser.vocab + ] + current_text = previous_text + delta + current_tokens = previous_tokens + token_delta + delta_message = reasoning_parser.extract_reasoning_content_streaming( + previous_text, + current_text, + delta, + previous_tokens, + current_tokens, + token_delta, + ) + if delta_message is not None: + reconstructor.append_delta(delta_message) + previous_text = current_text + previous_tokens = current_tokens + return reconstructor diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index e49562ad6a21f..01bcd78aa91a8 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser): validate_parsed_serve_args(args) +def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): + """Ensure validation fails if reasoning is enabled with auto tool choice""" + args = serve_parser.parse_args(args=[ + "--enable-auto-tool-choice", + "--enable-reasoning", + ]) + with pytest.raises(TypeError): + validate_parsed_serve_args(args) + + +def test_enable_reasoning_passes_with_reasoning_parser(serve_parser): + """Ensure validation passes if reasoning is enabled + with a reasoning parser""" + args = serve_parser.parse_args(args=[ + "--enable-reasoning", + "--reasoning-parser", + "deepseek_r1", + ]) + validate_parsed_serve_args(args) + + +def test_enable_reasoning_fails_without_reasoning_parser(serve_parser): + """Ensure validation fails if reasoning is enabled + without a reasoning parser""" + args = serve_parser.parse_args(args=["--enable-reasoning"]) + with pytest.raises(TypeError): + validate_parsed_serve_args(args) + + def test_chat_template_validation_for_happy_paths(serve_parser): """Ensure validation passes if the chat template exists""" args = serve_parser.parse_args( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 077bc993726ae..9e5cf4ba2e490 100644 --- a/vllm/entrypoints/openai/api_server.py +++ 
b/vllm/entrypoints/openai/api_server.py @@ -61,6 +61,7 @@ TokenizeRequest, TokenizeResponse, UnloadLoraAdapterRequest) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -771,6 +772,8 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_reasoning=args.enable_reasoning, + reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( @@ -844,6 +847,13 @@ async def run_server(args, **uvicorn_kwargs) -> None: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " f"(chose from {{ {','.join(valid_tool_parses)} }})") + valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() + if args.enable_reasoning \ + and args.reasoning_parser not in valid_reasoning_parses: + raise KeyError( + f"invalid reasoning parser: {args.reasoning_parser} " + f"(chose from {{ {','.join(valid_reasoning_parses)} }})") + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 4df75a665bab9..9cfe07c65d55e 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -12,6 +12,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager from vllm.entrypoints.openai.serving_models import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -208,6 +209,23 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Enable auto tool choice for supported models. Use " "``--tool-call-parser`` to specify which parser to use.") + parser.add_argument( + "--enable-reasoning", + action="store_true", + default=False, + help="Whether to enable reasoning_content for the model. " + "If enabled, the model will be able to generate reasoning content.") + + valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys() + parser.add_argument( + "--reasoning-parser", + type=str, + metavar="{" + ",".join(valid_reasoning_parsers) + "}", + default=None, + help= + "Select the reasoning parser depending on the model that you're using." + " This is used to parse the reasoning content into OpenAI API " + "format. Required for ``--enable-reasoning``.") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -267,6 +285,18 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") + # Enable reasoning needs a reasoning parser to be valid + if args.enable_reasoning and not args.reasoning_parser: + raise TypeError("Error: --enable-reasoning requires " + "--reasoning-parser") + + # Ref https://api-docs.deepseek.com/guides/reasoning_model + # tool call and reasoning cannot be enabled at the same time. 
+ if args.enable_auto_tool_choice and args.enable_reasoning: + raise TypeError( + "Error: --enable-auto-tool-choice and " + "--enable-reasoning cannot be enabled at the same time") + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f89c3f42aab17..2bc136cc48038 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1202,6 +1202,7 @@ class ExtractedToolCallInformation(BaseModel): class ChatMessage(OpenAIBaseModel): role: str + reasoning_content: Optional[str] = None content: Optional[str] = None tool_calls: List[ToolCall] = Field(default_factory=list) @@ -1243,6 +1244,7 @@ class ChatCompletionResponse(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: List[DeltaToolCall] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py new file mode 100644 index 0000000000000..a21bff52f61fa --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py @@ -0,0 +1,6 @@ +from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager +from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser + +__all__ = [ + "ReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser" +] diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py new file mode 100644 index 0000000000000..e5d10ee0bc3a8 --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -0,0 +1,158 @@ +import os +from functools import cached_property +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import import_from_path, is_list_of + +logger = init_logger(__name__) + + +class ReasoningParser: + """ + Abstract reasoning parser class that should not be used directly. + Provided and methods should be used in derived classes. + + It is used to extract reasoning content from the model output. + """ + + def __init__(self, tokenizer: AnyTokenizer): + self.model_tokenizer = tokenizer + + @cached_property + def vocab(self) -> Dict[str, int]: + # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab + # whereas all tokenizers have .get_vocab() + return self.model_tokenizer.get_vocab() + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + Tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. 
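+
+        For example, the DeepSeek-R1 parser registered later in this patch
+        returns ("abc", "xyz") for "<think>abc</think>xyz", and
+        (None, model_output) when the output contains no think tokens
+        (see the unit tests above).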
+ """ + + raise NotImplementedError( + "AbstractReasoningParser.extract_reasoning_calls " + "has not been implemented!") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + raise NotImplementedError( + "AbstractReasoningParser.extract_reasoning_content_streaming " + "has not been implemented!") + + +class ReasoningParserManager: + reasoning_parsers: Dict[str, Type] = {} + + @classmethod + def get_reasoning_parser(cls, name) -> Type: + """ + Get reasoning parser by name which is registered by `register_module`. + + Raise a KeyError exception if the name is not registered. + """ + if name in cls.reasoning_parsers: + return cls.reasoning_parsers[name] + + raise KeyError(f"reasoning helper: '{name}' not found in " + "reasoning_parsers") + + @classmethod + def _register_module(cls, + module: Type, + module_name: Optional[Union[str, List[str]]] = None, + force: bool = True) -> None: + if not issubclass(module, ReasoningParser): + raise TypeError("module must be subclass of ReasoningParser, " + f"but got {type(module)}") + if module_name is None: + module_name = module.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in cls.reasoning_parsers: + existed_module = cls.reasoning_parsers[name] + raise KeyError(f"{name} is already registered " + f"at {existed_module.__module__}") + cls.reasoning_parsers[name] = module + + @classmethod + def register_module( + cls, + name: Optional[Union[str, List[str]]] = None, + force: bool = True, + module: Union[Type, None] = None) -> Union[type, Callable]: + """ + Register module with the given name or name list. it can be used as a + decoder(with module as None) or normal function(with module as not + None). + """ + if not isinstance(force, bool): + raise TypeError(f"force must be a boolean, but got {type(force)}") + + # raise the error ahead of time + if not (name is None or isinstance(name, str) + or is_list_of(name, str)): + raise TypeError( + "name must be None, an instance of str, or a sequence of str, " + f"but got {type(name)}") + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + cls._register_module(module=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(module): + cls._register_module(module=module, module_name=name, force=force) + return module + + return _register + + @classmethod + def import_reasoning_parser(cls, plugin_path: str) -> None: + """ + Import a user-defined reasoning parser by the path + of the reasoning parser define file. 
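+
+        A hypothetical usage sketch (the path and parser name below are
+        placeholders, assuming the plugin file registers a parser under
+        that name with ``ReasoningParserManager.register_module``):
+
+            ReasoningParserManager.import_reasoning_parser(
+                "/path/to/my_reasoning_parser.py")
+            parser_cls = ReasoningParserManager.get_reasoning_parser(
+                "my_reasoning")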
+        """
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            logger.exception("Failed to load module '%s' from %s.",
+                             module_name, plugin_path)
+            return
diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
new file mode 100644
index 0000000000000..a440ddc8d3b5d
--- /dev/null
+++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -0,0 +1,133 @@
+import re
+from typing import Optional, Sequence, Tuple, Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
+    ReasoningParser, ReasoningParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("deepseek_r1")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for DeepSeek R1 model.
+
+    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning
+    text. This parser extracts the reasoning content from the model output.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        self.think_start_token = "<think>"
+        self.think_end_token = "</think>"
+
+        self.reasoning_regex = re.compile(
+            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction.")
+
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+        if (self.think_start_token_id is None
+                or self.think_end_token_id is None):
+            raise RuntimeError(
+                "DeepSeek R1 reasoning parser could not locate think start/end "
+                "tokens in the tokenizer!")
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
+                self.think_start_token_id, self.think_end_token_id
+        ]):
+            return None
+
+        if self.think_start_token_id in previous_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # <think> in previous, </think> in previous,
+                # content continues
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token_id in delta_token_ids:
+            logger.info(delta_text)
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index +
+                                               len(self.think_start_token
+                                                   ):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # No <think> in previous or delta, treat the delta as content.
+            return DeltaMessage(content=delta_text)
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> Tuple[Optional[str], Optional[str]]:
+
+        # Check if the model output contains the <think> tokens.
+        if (self.think_start_token not in model_output
+                or self.think_end_token not in model_output):
+            return None, model_output
+        else:
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            # Remove the reasoning content from the model output
+            # Although deepseek's <think> token is always at the
+            # beginning of the line, we cannot guarantee that the
+            # other models will follow this convention.
+            # Therefore, we need to add :start_index.
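+            # Worked example (for illustration): with model_output
+            # "prefix<think>abc</think>xyz", start_index is 6, end_index is
+            # 6 + len("<think>abc</think>"), and the result is
+            # reasoning_content "abc" with content "prefixxyz".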
+ start_index = model_output.find(self.think_start_token) + if start_index != -1: + end_index = start_index + len( + f"{self.think_start_token}{reasoning_content}{self.think_end_token}" + ) + model_output = model_output[:start_index] + \ + model_output[end_index:] + + if len(model_output) == 0: + return reasoning_content, None + + return reasoning_content, model_output diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 89a119ac65695..dc97f0eb059d7 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,6 +21,8 @@ ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) +from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, + ReasoningParserManager) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager @@ -47,6 +49,8 @@ def __init__( chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, return_tokens_as_token_ids: bool = False, + enable_reasoning: bool = False, + reasoning_parser: Optional[str] = None, enable_auto_tools: bool = False, tool_parser: Optional[str] = None, enable_prompt_tokens_details: bool = False, @@ -69,6 +73,18 @@ def __init__( " the parallel_tool_calls client option is preset for " "compatibility reasons, it will be ignored.") + self.enable_reasoning: bool = enable_reasoning + self.reasoning_parser: Optional[Callable[[AnyTokenizer], + ReasoningParser]] = None + if self.enable_reasoning: + try: + self.reasoning_parser = ( + ReasoningParserManager.get_reasoning_parser( + reasoning_parser)) + except Exception as e: + raise TypeError("Error: --enable-reasoning requires " + f"reasoning_parser:'{reasoning_parser}' " + "which has not been registered") from e self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None if self.enable_auto_tools: try: @@ -285,14 +301,35 @@ async def chat_completion_stream_generator( not tool_choice_function_name and self._should_stream_with_auto_tool_parsing(request)) + should_stream_with_reasoning_parsing = ( + self._should_stream_with_reasoning_parsing(request)) + all_previous_token_ids: Optional[List[List[int]]] - if tool_choice_auto: + + # Only one of these will be used, thus previous_texts and + # all_previous_token_ids will not be used twice in the same iteration. + if tool_choice_auto or should_stream_with_reasoning_parsing: # These are only required in "auto" tool choice case previous_texts = [""] * num_choices all_previous_token_ids = [[]] * num_choices else: previous_texts, all_previous_token_ids = None, None + try: + # There is no need to check if the reasoning_parser is None + # because the should_stream_with_reasoning_parsing check + # already ensures that the reasoning_parser is not None. + # but the pre-commit hook requires it. 
+ if should_stream_with_reasoning_parsing and \ + self.reasoning_parser is not None: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + return + # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: @@ -456,6 +493,32 @@ async def chat_completion_stream_generator( # update the previous values for the next iteration previous_texts[i] = current_text all_previous_token_ids[i] = current_token_ids + # reasoning_content cannot be enabled with tool_choice. + # If it is, the tool_choice will be used instead. + elif self.enable_reasoning: + # handle reasoning_content delta + assert reasoning_parser is not None + assert previous_texts is not None + assert all_previous_token_ids is not None + previous_text = previous_texts[i] + previous_token_ids = all_previous_token_ids[i] + current_text = previous_text + delta_text + current_token_ids = previous_token_ids + list( + output.token_ids) + + delta_message = (reasoning_parser. + extract_reasoning_content_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output.token_ids, + )) + + # update the previous values for the next iteration + previous_texts[i] = current_text + all_previous_token_ids[i] = current_token_ids # handle streaming just a content delta else: @@ -642,17 +705,38 @@ async def chat_completion_full_generator( else: logprobs = None + should_stream_with_reasoning_parsing = ( + self._should_stream_with_reasoning_parsing(request)) + # In the OpenAI API the finish_reason is "tools_called" # if the tool choice is auto and the model produced a tool # call. The same is not true for named function calls auto_tools_called = False + if should_stream_with_reasoning_parsing and \ + self.reasoning_parser is not None: + try: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + return self.create_error_response(str(e)) + + reasoning_content, content = ( + reasoning_parser.extract_reasoning_content( + output.text, request=request)) + + if reasoning_content: + message = ChatMessage(role=role, + content=content, + reasoning_content=reasoning_content) + else: + message = ChatMessage(role=role, content=output.text) + # if auto tools are not enabled, and a named tool choice using # outlines is not being used - if (not self.enable_auto_tools - or not self.tool_parser) and not isinstance( - request.tool_choice, - ChatCompletionNamedToolChoiceParam): + elif (not self.enable_auto_tools + or not self.tool_parser) and not isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam): message = ChatMessage(role=role, content=output.text) # if the request uses tools and specified a tool choice @@ -835,6 +919,17 @@ def _should_stream_with_auto_tool_parsing(self, return (request.tools and self.tool_parser and self.enable_auto_tools and request.tool_choice in ['auto', None]) + def _should_stream_with_reasoning_parsing(self, + request: ChatCompletionRequest): + """ + Utility function to check if streamed tokens should go through the + reasoning parser that was configured. + + We only want to do this IF reasoning is enabled and a reasoning + parser is configured. 
+ """ + return self.enable_reasoning and self.reasoning_parser is not None + def _should_check_for_unstreamed_tool_arg_tokens( self, delta_message: Optional[DeltaMessage], diff --git a/vllm/scripts.py b/vllm/scripts.py index 42e1c639eda10..8101e6b3af7ee 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -167,6 +167,7 @@ def main(): "Must be a YAML with the following options:" "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) + serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) From dd6a3a02cb3bf2a7bc6cb84c85dcd57c6eaf2bf9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 29 Jan 2025 03:38:29 +0000 Subject: [PATCH 27/69] [Doc] Convert docs to use colon fences (#12471) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/requirements-docs.txt | 4 +- docs/source/api/engine/index.md | 4 +- docs/source/api/model/index.md | 4 +- docs/source/api/multimodal/index.md | 4 +- docs/source/api/offline_inference/index.md | 4 +- .../contributing/dockerfile/dockerfile.md | 4 +- docs/source/contributing/model/basic.md | 8 +- docs/source/contributing/model/index.md | 12 +- docs/source/contributing/model/multimodal.md | 32 +- .../source/contributing/model/registration.md | 16 +- docs/source/contributing/model/tests.md | 8 +- docs/source/contributing/overview.md | 12 +- .../contributing/profiling/profiling_index.md | 12 +- docs/source/deployment/docker.md | 16 +- .../source/deployment/frameworks/cerebrium.md | 4 +- docs/source/deployment/frameworks/dstack.md | 8 +- docs/source/deployment/frameworks/helm.md | 408 +++--- docs/source/deployment/frameworks/index.md | 4 +- docs/source/deployment/frameworks/skypilot.md | 36 +- docs/source/deployment/integrations/index.md | 4 +- docs/source/deployment/nginx.md | 4 +- docs/source/design/arch_overview.md | 20 +- docs/source/design/kernel/paged_attention.md | 32 +- docs/source/design/multiprocessing.md | 4 +- .../features/automatic_prefix_caching.md | 4 +- docs/source/features/compatibility_matrix.md | 879 ++++++------ docs/source/features/disagg_prefill.md | 20 +- docs/source/features/lora.md | 4 +- docs/source/features/quantization/auto_awq.md | 4 +- docs/source/features/quantization/fp8.md | 16 +- docs/source/features/quantization/gguf.md | 12 +- docs/source/features/quantization/index.md | 4 +- docs/source/features/quantization/int8.md | 8 +- .../quantization/supported_hardware.md | 229 +-- docs/source/features/spec_decode.md | 8 +- docs/source/features/structured_outputs.md | 4 +- docs/source/generate_examples.py | 4 +- .../ai_accelerator/hpu-gaudi.inc.md | 76 +- .../installation/ai_accelerator/index.md | 272 ++-- .../installation/ai_accelerator/neuron.inc.md | 4 +- .../installation/ai_accelerator/tpu.inc.md | 51 +- .../installation/cpu/apple.inc.md | 4 +- .../getting_started/installation/cpu/index.md | 86 +- .../installation/cpu/x86.inc.md | 4 +- .../installation/gpu/cuda.inc.md | 12 +- .../getting_started/installation/gpu/index.md | 206 +-- .../installation/gpu/rocm.inc.md | 25 +- .../installation/gpu/xpu.inc.md | 4 +- .../getting_started/installation/index.md | 4 +- .../installation/python_env_setup.inc.md | 4 +- docs/source/getting_started/quickstart.md | 12 +- .../source/getting_started/troubleshooting.md | 12 +- docs/source/index.md | 48 +- docs/source/models/extensions/index.md | 4 +- .../models/extensions/runai_model_streamer.md | 4 +- docs/source/models/extensions/tensorizer.md | 4 +- 
docs/source/models/generative_models.md | 4 +- docs/source/models/pooling_models.md | 60 +- docs/source/models/supported_models.md | 1263 +++++++++-------- docs/source/serving/distributed_serving.md | 12 +- docs/source/serving/engine_args.md | 2 + docs/source/serving/env_vars.md | 8 +- docs/source/serving/integrations/index.md | 4 +- docs/source/serving/metrics.md | 4 +- docs/source/serving/multimodal_inputs.md | 53 +- docs/source/serving/offline_inference.md | 8 +- .../serving/openai_compatible_server.md | 56 +- pyproject.toml | 1 + 68 files changed, 2091 insertions(+), 2080 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 8217bc3ba3ded..1d669699f4b2a 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,10 +1,10 @@ sphinx==6.2.1 +sphinx-argparse==0.4.0 sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 -myst-parser==3.0.1 -sphinx-argparse==0.4.0 sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 +myst-parser==3.0.1 msgspec cloudpickle diff --git a/docs/source/api/engine/index.md b/docs/source/api/engine/index.md index 701cb95d3be33..b6544d94afdf8 100644 --- a/docs/source/api/engine/index.md +++ b/docs/source/api/engine/index.md @@ -8,10 +8,10 @@ .. currentmodule:: vllm.engine ``` -```{toctree} +:::{toctree} :caption: Engines :maxdepth: 2 llm_engine async_llm_engine -``` +::: diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md index 113792147be7c..8fee3a55c93de 100644 --- a/docs/source/api/model/index.md +++ b/docs/source/api/model/index.md @@ -2,10 +2,10 @@ ## Submodules -```{toctree} +:::{toctree} :maxdepth: 1 interfaces_base interfaces adapters -``` +::: diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 14efdb506d76f..069ed53e545c5 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -17,7 +17,7 @@ Looking to add your own multi-modal model? Please follow the instructions listed ## Submodules -```{toctree} +:::{toctree} :maxdepth: 1 inputs @@ -25,4 +25,4 @@ parse processing profiling registry -``` +::: diff --git a/docs/source/api/offline_inference/index.md b/docs/source/api/offline_inference/index.md index c32f99d59e3db..ec2cc599d923c 100644 --- a/docs/source/api/offline_inference/index.md +++ b/docs/source/api/offline_inference/index.md @@ -1,9 +1,9 @@ # Offline Inference -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 llm llm_inputs -``` +::: diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index cb142318b8724..96674805df534 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -17,11 +17,11 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} /assets/contributing/dockerfile-stages-dependency.png + > :::{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% - > ``` + > ::: > > Made using: > diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index b9b92fd027f6e..180fdd59e9a64 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -10,9 +10,9 @@ First, clone the PyTorch model code from the source repository. 
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. -```{warning} +:::{warning} Make sure to review and adhere to the original code's copyright and licensing terms! -``` +::: ## 2. Make your code compatible with vLLM @@ -80,10 +80,10 @@ def forward( ... ``` -```{note} +:::{note} Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -``` +::: For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md index fe018b61b08cf..721ee3cd2047c 100644 --- a/docs/source/contributing/model/index.md +++ b/docs/source/contributing/model/index.md @@ -4,7 +4,7 @@ This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 @@ -12,16 +12,16 @@ basic registration tests multimodal -``` +::: -```{note} +:::{note} The complexity of adding a new model depends heavily on the model's architecture. The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -``` +::: -```{tip} +:::{tip} If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) or ask on our [developer slack](https://slack.vllm.ai). We will be happy to help you out! -``` +::: diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index e5fd9a2877ceb..6c6f3b701cd28 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -48,9 +48,9 @@ Further update the model as follows: return vision_embeddings ``` - ```{important} + :::{important} The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - ``` + ::: - Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. @@ -89,10 +89,10 @@ Further update the model as follows: + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` - ```{note} + :::{note} The model class does not have to be named {code}`*ForCausalLM`. Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. - ``` + ::: ## 2. 
Specify processing information @@ -120,8 +120,8 @@ When calling the model, the output embeddings from the visual encoder are assign containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal to the size of the output embeddings. -::::{tab-set} -:::{tab-item} Basic example: LLaVA +:::::{tab-set} +::::{tab-item} Basic example: LLaVA :sync: llava Looking at the code of HF's `LlavaForConditionalGeneration`: @@ -254,12 +254,12 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} ``` -```{note} +:::{note} Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. -``` - ::: + :::: +::::: ## 3. Specify dummy inputs @@ -315,17 +315,17 @@ def get_dummy_processor_inputs( Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to fill in the missing details about HF processing. -```{seealso} +:::{seealso} [Multi-Modal Data Processing](#mm-processing) -``` +::: ### Multi-modal fields Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. -::::{tab-set} -:::{tab-item} Basic example: LLaVA +:::::{tab-set} +::::{tab-item} Basic example: LLaVA :sync: llava Looking at the model's `forward` method: @@ -367,13 +367,13 @@ def _get_mm_fields_config( ) ``` -```{note} +:::{note} Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. -``` - ::: + :::: +::::: ### Prompt replacements diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md index d6c9e4181dfee..64cd25b53807e 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/source/contributing/model/registration.md @@ -17,17 +17,17 @@ After you have implemented your model (see [tutorial](#new-model-basic)), put it Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. Finally, update our [list of supported models](#supported-models) to promote your model! -```{important} +:::{important} The list of models in each section should be maintained in alphabetical order. -``` +::: ## Out-of-tree models You can load an external model using a plugin without modifying the vLLM codebase. -```{seealso} +:::{seealso} [vLLM's Plugin System](#plugin-system) -``` +::: To register the model, use the following code: @@ -45,11 +45,11 @@ from vllm import ModelRegistry ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") ``` -```{important} +:::{important} If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that [here](#supports-multimodal). -``` +::: -```{note} +:::{note} Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. 
-``` +::: diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md index 74c933b2f45da..68d51d89f7cff 100644 --- a/docs/source/contributing/model/tests.md +++ b/docs/source/contributing/model/tests.md @@ -14,14 +14,14 @@ Without them, the CI for your PR will fail. Include an example HuggingFace repository for your model in . This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. -```{important} +:::{important} The list of models in each section should be maintained in alphabetical order. -``` +::: -```{tip} +:::{tip} If your model requires a development version of HF Transformers, you can set `min_transformers_version` to skip the test in CI until the model is released. -``` +::: ## Optional Tests diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 36cf8e7440eca..908c7cb4d38ee 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -35,17 +35,17 @@ pre-commit run --all-files pytest tests/ ``` -```{note} +:::{note} Currently, the repository is not fully checked by `mypy`. -``` +::: ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -```{important} +:::{important} If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). -``` +::: ## Pull Requests & Code Reviews @@ -81,9 +81,9 @@ appropriately to indicate the type of change. Please use one of the following: - `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -```{note} +:::{note} If the PR spans more than one category, please include all relevant prefixes. -``` +::: ### Code Quality diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 001db86bdf555..79aeb292a9b73 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -6,21 +6,21 @@ The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` en When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. -```{warning} +:::{warning} Only enable profiling in a development environment. -``` +::: Traces can be visualized using . -```{tip} +:::{tip} Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -``` +::: -```{tip} +:::{tip} To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. 
`export VLLM_RPC_TIMEOUT=1800000` -``` +::: ## Example commands and usage diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 438be47316f3b..334c02225bd6b 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -21,11 +21,11 @@ $ docker run --runtime nvidia --gpus all \ You can add any other you need after the image tag (`vllm/vllm-openai:latest`). -```{note} +:::{note} You can either use the `ipc=host` flag or `--shm-size` flag to allow the container to access the host's shared memory. vLLM uses PyTorch, which uses shared memory to share data between processes under the hood, particularly for tensor parallel inference. -``` +::: (deployment-docker-build-image-from-source)= @@ -38,25 +38,25 @@ You can build and run vLLM from source via the provided . To DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` -```{note} +:::{note} By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` for vLLM to find the current GPU type and build for that. If you are using Podman instead of Docker, you might need to disable SELinux labeling by adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). -``` +::: ## Building for Arm64/aarch64 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. -```{note} +:::{note} Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). -``` +::: ```console # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) @@ -85,6 +85,6 @@ $ docker run --runtime nvidia --gpus all \ The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). -```{note} +:::{note} **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . -``` +::: diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md index 5787c4a407bfb..b20c95137b6e7 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -2,11 +2,11 @@ # Cerebrium -```{raw} html +:::{raw} html

vLLM_plus_cerebrium

-``` +::: vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md index b42a34125c6d7..a16e28f2d8983 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -2,11 +2,11 @@ # dstack -```{raw} html +:::{raw} html

vLLM_plus_dstack

-``` +::: vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. @@ -97,6 +97,6 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -```{note} +:::{note} dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) -``` +::: diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md index 18ed293191468..e4fc5e1313079 100644 --- a/docs/source/deployment/frameworks/helm.md +++ b/docs/source/deployment/frameworks/helm.md @@ -38,213 +38,213 @@ chart **including persistent volumes** and deletes the release. ## Architecture -```{image} /assets/deployment/architecture_helm_deployment.png -``` +:::{image} /assets/deployment/architecture_helm_deployment.png +::: ## Values -```{list-table} +:::{list-table} :widths: 25 25 25 25 :header-rows: 1 -* - Key - - Type - - Default - - Description -* - autoscaling - - object - - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - - Autoscaling configuration -* - autoscaling.enabled - - bool - - false - - Enable autoscaling -* - autoscaling.maxReplicas - - int - - 100 - - Maximum replicas -* - autoscaling.minReplicas - - int - - 1 - - Minimum replicas -* - autoscaling.targetCPUUtilizationPercentage - - int - - 80 - - Target CPU utilization for autoscaling -* - configs - - object - - {} - - Configmap -* - containerPort - - int - - 8000 - - Container port -* - customObjects - - list - - [] - - Custom Objects configuration -* - deploymentStrategy - - object - - {} - - Deployment strategy configuration -* - externalConfigs - - list - - [] - - External configuration -* - extraContainers - - list - - [] - - Additional containers configuration -* - extraInit - - object - - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - - Additional configuration for the init container -* - extraInit.pvcStorage - - string - - "50Gi" - - Storage size of the s3 -* - extraInit.s3modelpath - - string - - "relative_s3_model_path/opt-125m" - - Path of the model on the s3 which hosts model weights and config files -* - extraInit.awsEc2MetadataDisabled - - boolean - - true - - Disables the use of the Amazon EC2 instance metadata service -* - extraPorts - - list - - [] - - Additional ports configuration -* - gpuModels - - list - - ["TYPE_GPU_USED"] - - Type of gpu used -* - image - - object - - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - - Image configuration -* - image.command - - list - - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - - Container launch command -* - image.repository - - string - - "vllm/vllm-openai" - - Image repository -* - image.tag - - string - - "latest" - - Image tag -* - livenessProbe - - object - - 
{"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - - Liveness probe configuration -* - livenessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive -* - livenessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server -* - livenessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server -* - livenessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening -* - livenessProbe.initialDelaySeconds - - int - - 15 - - Number of seconds after the container has started before liveness probe is initiated -* - livenessProbe.periodSeconds - - int - - 10 - - How often (in seconds) to perform the liveness probe -* - maxUnavailablePodDisruptionBudget - - string - - "" - - Disruption Budget Configuration -* - readinessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - - Readiness probe configuration -* - readinessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -* - readinessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server -* - readinessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server -* - readinessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening -* - readinessProbe.initialDelaySeconds - - int - - 5 - - Number of seconds after the container has started before readiness probe is initiated -* - readinessProbe.periodSeconds - - int - - 5 - - How often (in seconds) to perform the readiness probe -* - replicaCount - - int - - 1 - - Number of replicas -* - resources - - object - - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - - Resource configuration -* - resources.limits."nvidia.com/gpu" - - int - - 1 - - Number of gpus used -* - resources.limits.cpu - - int - - 4 - - Number of CPUs -* - resources.limits.memory - - string - - "16Gi" - - CPU memory configuration -* - resources.requests."nvidia.com/gpu" - - int - - 1 - - Number of gpus used -* - resources.requests.cpu - - int - - 4 - - Number of CPUs -* - resources.requests.memory - - string - - "16Gi" - - CPU memory configuration -* - secrets - - object - - {} - - Secrets configuration -* - serviceName - - string - - - - Service name -* - servicePort - - int - - 80 - - Service port -* - labels.environment - - string - - test - - Environment name -* - labels.release - - string - - test - - Release name -``` +- * Key + * Type + * Default + * Description +- * autoscaling + * object + * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} + * Autoscaling configuration +- * autoscaling.enabled + * bool + * false + * Enable autoscaling +- * autoscaling.maxReplicas + * int + * 100 + * Maximum replicas +- * autoscaling.minReplicas + * int + * 1 + * Minimum replicas +- * autoscaling.targetCPUUtilizationPercentage + * int + * 80 + * Target CPU utilization for autoscaling +- * configs + * object + * {} + * Configmap +- * containerPort + * int + * 
8000 + * Container port +- * customObjects + * list + * [] + * Custom Objects configuration +- * deploymentStrategy + * object + * {} + * Deployment strategy configuration +- * externalConfigs + * list + * [] + * External configuration +- * extraContainers + * list + * [] + * Additional containers configuration +- * extraInit + * object + * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} + * Additional configuration for the init container +- * extraInit.pvcStorage + * string + * "50Gi" + * Storage size of the s3 +- * extraInit.s3modelpath + * string + * "relative_s3_model_path/opt-125m" + * Path of the model on the s3 which hosts model weights and config files +- * extraInit.awsEc2MetadataDisabled + * boolean + * true + * Disables the use of the Amazon EC2 instance metadata service +- * extraPorts + * list + * [] + * Additional ports configuration +- * gpuModels + * list + * ["TYPE_GPU_USED"] + * Type of gpu used +- * image + * object + * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} + * Image configuration +- * image.command + * list + * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] + * Container launch command +- * image.repository + * string + * "vllm/vllm-openai" + * Image repository +- * image.tag + * string + * "latest" + * Image tag +- * livenessProbe + * object + * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} + * Liveness probe configuration +- * livenessProbe.failureThreshold + * int + * 3 + * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive +- * livenessProbe.httpGet + * object + * {"path":"/health","port":8000} + * Configuration of the Kubelet http request on the server +- * livenessProbe.httpGet.path + * string + * "/health" + * Path to access on the HTTP server +- * livenessProbe.httpGet.port + * int + * 8000 + * Name or number of the port to access on the container, on which the server is listening +- * livenessProbe.initialDelaySeconds + * int + * 15 + * Number of seconds after the container has started before liveness probe is initiated +- * livenessProbe.periodSeconds + * int + * 10 + * How often (in seconds) to perform the liveness probe +- * maxUnavailablePodDisruptionBudget + * string + * "" + * Disruption Budget Configuration +- * readinessProbe + * object + * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} + * Readiness probe configuration +- * readinessProbe.failureThreshold + * int + * 3 + * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready +- * readinessProbe.httpGet + * object + * {"path":"/health","port":8000} + * Configuration of the Kubelet http request on the server +- * readinessProbe.httpGet.path + * string + * "/health" + * Path to access on the HTTP server +- * readinessProbe.httpGet.port + * int + * 8000 + * Name or number of the port to access on the container, on which the server is listening +- * readinessProbe.initialDelaySeconds + * int + * 5 + * Number of seconds after the container has started before readiness probe is initiated +- * readinessProbe.periodSeconds + * int + * 5 + * How often (in seconds) to perform the readiness probe +- * replicaCount + 
* int + * 1 + * Number of replicas +- * resources + * object + * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} + * Resource configuration +- * resources.limits."nvidia.com/gpu" + * int + * 1 + * Number of gpus used +- * resources.limits.cpu + * int + * 4 + * Number of CPUs +- * resources.limits.memory + * string + * "16Gi" + * CPU memory configuration +- * resources.requests."nvidia.com/gpu" + * int + * 1 + * Number of gpus used +- * resources.requests.cpu + * int + * 4 + * Number of CPUs +- * resources.requests.memory + * string + * "16Gi" + * CPU memory configuration +- * secrets + * object + * {} + * Secrets configuration +- * serviceName + * string + * + * Service name +- * servicePort + * int + * 80 + * Service port +- * labels.environment + * string + * test + * Environment name +- * labels.release + * string + * test + * Release name +::: diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index 964782763f6b3..cb758d3e6d2e4 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -1,6 +1,6 @@ # Using other frameworks -```{toctree} +:::{toctree} :maxdepth: 1 bentoml @@ -11,4 +11,4 @@ lws modal skypilot triton -``` +::: diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index 051fc2f2a8d4e..5e101b9001033 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -2,11 +2,11 @@ # SkyPilot -```{raw} html +:::{raw} html
vLLM
-``` +::: vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). @@ -104,10 +104,10 @@ service: max_completion_tokens: 1 ``` -```{raw} html +:::{raw} html
Click to see the full recipe YAML -``` +::: ```yaml service: @@ -153,9 +153,9 @@ run: | 2>&1 | tee api_server.log ``` -```{raw} html +:::{raw} html
-``` +::: Start the serving the Llama-3 8B model on multiple replicas: @@ -169,10 +169,10 @@ Wait until the service is ready: watch -n10 sky serve status vllm ``` -```{raw} html +:::{raw} html
Example outputs: -``` +::: ```console Services @@ -185,9 +185,9 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 ``` -```{raw} html +:::{raw} html
-``` +::: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: @@ -223,10 +223,10 @@ service: This will scale the service up to when the QPS exceeds 2 for each replica. -```{raw} html +:::{raw} html
Click to see the full recipe YAML -``` +::: ```yaml service: @@ -275,9 +275,9 @@ run: | 2>&1 | tee api_server.log ``` -```{raw} html +:::{raw} html
-``` +::: To update the service with the new config: @@ -295,10 +295,10 @@ sky serve down vllm It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. -```{raw} html +:::{raw} html
Click to see the full GUI YAML -``` +::: ```yaml envs: @@ -328,9 +328,9 @@ run: | --stop-token-ids 128009,128001 | tee ~/gradio.log ``` -```{raw} html +:::{raw} html
-``` +::: 1. Start the chat web UI: diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md index d47ede8967547..c286edb4d7bc1 100644 --- a/docs/source/deployment/integrations/index.md +++ b/docs/source/deployment/integrations/index.md @@ -1,9 +1,9 @@ # External Integrations -```{toctree} +:::{toctree} :maxdepth: 1 kserve kubeai llamastack -``` +::: diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md index a58f791c2997b..87feb48856853 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/source/deployment/nginx.md @@ -105,9 +105,9 @@ docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-si docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf ``` -```{note} +:::{note} If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. -``` +::: (nginxloadbalancer-nginx-launch-nginx)= diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index cec503ef2f77d..04886e5981eef 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -4,19 +4,19 @@ This document provides an overview of the vLLM architecture. -```{contents} Table of Contents +:::{contents} Table of Contents :depth: 2 :local: true -``` +::: ## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -```{image} /assets/design/arch_overview/entrypoints.excalidraw.png +:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png :alt: Entrypoints Diagram -``` +::: ### LLM Class @@ -84,9 +84,9 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -```{image} /assets/design/arch_overview/llm_engine.excalidraw.png +:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png :alt: LLMEngine Diagram -``` +::: ### LLMEngine @@ -144,11 +144,11 @@ configurations affect the class we ultimately get. The following figure shows the class hierarchy of vLLM: -> ```{figure} /assets/design/hierarchy.png +> :::{figure} /assets/design/hierarchy.png > :align: center > :alt: query > :width: 100% -> ``` +> ::: There are several important design choices behind this class hierarchy: @@ -178,7 +178,7 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -````{note} +:::{note} To support this change, all vLLM models' signatures have been updated to: ```python @@ -215,7 +215,7 @@ else: ``` This way, the model can work with both old and new versions of vLLM. -```` +::: 3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. 
For example, tensor parallelism needs to shard the diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index f896f903c78f5..5f2582877260a 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -139,26 +139,26 @@ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` - ```{figure} ../../assets/kernel/query.png + :::{figure} ../../assets/kernel/query.png :align: center :alt: query :width: 70% Query data of one token at one head - ``` + ::: - Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. - ```{figure} ../../assets/kernel/q_vecs.png + :::{figure} ../../assets/kernel/q_vecs.png :align: center :alt: q_vecs :width: 70% `q_vecs` for one thread group - ``` + ::: ```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -195,13 +195,13 @@ points to key token data based on `k_cache` at assigned block, assigned head and assigned token. - ```{figure} ../../assets/kernel/key.png + :::{figure} ../../assets/kernel/key.png :align: center :alt: key :width: 70% Key data of all context tokens at one head - ``` + ::: - The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -214,13 +214,13 @@ elements for one token) that will be processed by 2 threads (one thread group) separately. - ```{figure} ../../assets/kernel/k_vecs.png + :::{figure} ../../assets/kernel/k_vecs.png :align: center :alt: k_vecs :width: 70% `k_vecs` for one thread - ``` + ::: ```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -289,14 +289,14 @@ should be performed across the entire thread block, encompassing results between the query token and all context key tokens. - ```{math} + :::{math} :nowrap: true \begin{gather*} m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} \end{gather*} - ``` + ::: ### `qk_max` and `logits` @@ -379,29 +379,29 @@ ## Value -```{figure} ../../assets/kernel/value.png +:::{figure} ../../assets/kernel/value.png :align: center :alt: value :width: 70% Value data of all context tokens at one head -``` +::: -```{figure} ../../assets/kernel/logits_vec.png +:::{figure} ../../assets/kernel/logits_vec.png :align: center :alt: logits_vec :width: 50% `logits_vec` for one thread -``` +::: -```{figure} ../../assets/kernel/v_vec.png +:::{figure} ../../assets/kernel/v_vec.png :align: center :alt: v_vec :width: 70% List of `v_vec` for one thread -``` +::: - Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index c2cdb75ea08a7..55dae0bb92d4e 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -7,9 +7,9 @@ page for information on known issues and how to solve them. ## Introduction -```{important} +:::{important} The source code references are to the state of the code at the time of writing in December, 2024. 
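A quick way to sanity-check the softmax reduction defined in the paged attention walkthrough above (the m(x), f(x) and l(x) terms) is to write it out in NumPy. This is only an illustration of the math; the kernel performs the same reduction cooperatively across the thread block rather than calling NumPy.

```python
import numpy as np

def stable_softmax(x: np.ndarray) -> np.ndarray:
    """Max-subtracted softmax, matching the m(x), f(x), l(x) definitions above."""
    m = np.max(x)          # m(x): maximum over the qk results
    f = np.exp(x - m)      # f(x): exponentials shifted by the max to avoid overflow
    l = np.sum(f)          # l(x): normalization constant
    return f / l           # softmax(x) = f(x) / l(x)

qk = np.array([3.2, 11.5, 0.7, 9.9], dtype=np.float32)
probs = stable_softmax(qk)
assert np.isclose(probs.sum(), 1.0)
```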
-``` +::: The use of Python multiprocessing in vLLM is complicated by: diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/source/features/automatic_prefix_caching.md index 3d70cbb29c385..59016d7fcf6b3 100644 --- a/docs/source/features/automatic_prefix_caching.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -6,9 +6,9 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. -```{note} +:::{note} Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). -``` +::: ## Enabling APC in vLLM diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 47ab616b30686..b0018ebccf5ba 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -4,13 +4,13 @@ The tables below show mutually exclusive features and the support on some hardware. -```{note} +:::{note} Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. -``` +::: ## Feature x Feature -```{raw} html +:::{raw} html -``` +::: -```{list-table} - :header-rows: 1 - :stub-columns: 1 - :widths: auto +:::{list-table} +:header-rows: 1 +:stub-columns: 1 +:widths: auto - * - Feature - - [CP](#chunked-prefill) - - [APC](#automatic-prefix-caching) - - [LoRA](#lora-adapter) - - prmpt adptr - - [SD](#spec_decode) - - CUDA graph - - pooling - - enc-dec - - logP - - prmpt logP - - async output - - multi-step - - mm - - best-of - - beam-search - - guided dec - * - [CP](#chunked-prefill) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [APC](#automatic-prefix-caching) - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [LoRA](#lora-adapter) - - [✗](gh-pr:9057) - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - prmpt adptr - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [SD](#spec_decode) - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - * - pooling - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - - - - - - - - - - - - - - - - - - - - * - enc-dec - - ✗ - - [✗](gh-issue:7366) - - ✗ - - ✗ - - [✗](gh-issue:7366) - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - * - logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - * - prmpt logP - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-pr:8199) - - ✅ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - - - * - async output - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - * - multi-step - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - [✗](gh-issue:8198) - - ✅ - - - - - - - - - - - * - mm - - ✅ - - [✗](gh-pr:8348) - - [✗](gh-pr:7199) - - ? - - ? - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? - - - - - - - - - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:6137) - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - [✗](gh-issue:7968) - - ✅ - - - - - - - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:6137) - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - [✗](gh-issue:7968>) - - ? - - ✅ - - - - - * - guided dec - - ✅ - - ✅ - - ? - - ? - - [✗](gh-issue:11484) - - ✅ - - ✗ - - ? - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:9893) - - ? 
- - ✅ - - ✅ - - - -``` +- * Feature + * [CP](#chunked-prefill) + * [APC](#automatic-prefix-caching) + * [LoRA](#lora-adapter) + * prmpt adptr + * [SD](#spec_decode) + * CUDA graph + * pooling + * enc-dec + * logP + * prmpt logP + * async output + * multi-step + * mm + * best-of + * beam-search + * guided dec +- * [CP](#chunked-prefill) + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [APC](#automatic-prefix-caching) + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [LoRA](#lora-adapter) + * [✗](gh-pr:9057) + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * prmpt adptr + * ✅ + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [SD](#spec_decode) + * ✅ + * ✅ + * ✗ + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * +- * CUDA graph + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * + * + * +- * pooling + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * + * + * + * + * + * + * + * + * + * +- * enc-dec + * ✗ + * [✗](gh-issue:7366) + * ✗ + * ✗ + * [✗](gh-issue:7366) + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * +- * logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ + * + * + * + * + * + * + * + * +- * prmpt logP + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-pr:8199) + * ✅ + * ✗ + * ✅ + * ✅ + * + * + * + * + * + * + * +- * async output + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ + * ✗ + * ✗ + * ✅ + * ✅ + * + * + * + * + * + * +- * multi-step + * ✗ + * ✅ + * ✗ + * ✅ + * ✗ + * ✅ + * ✗ + * ✗ + * ✅ + * [✗](gh-issue:8198) + * ✅ + * + * + * + * + * +- * mm + * ✅ + * [✗](gh-pr:8348) + * [✗](gh-pr:7199) + * ? + * ? + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ? + * + * + * + * +- * best-of + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:6137) + * ✅ + * ✗ + * ✅ + * ✅ + * ✅ + * ? + * [✗](gh-issue:7968) + * ✅ + * + * + * +- * beam-search + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:6137) + * ✅ + * ✗ + * ✅ + * ✅ + * ✅ + * ? + * [✗](gh-issue:7968>) + * ? + * ✅ + * + * +- * guided dec + * ✅ + * ✅ + * ? + * ? + * [✗](gh-issue:11484) + * ✅ + * ✗ + * ? + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:9893) + * ? + * ✅ + * ✅ + * +::: (feature-x-hardware)= ## Feature x Hardware -```{list-table} - :header-rows: 1 - :stub-columns: 1 - :widths: auto +:::{list-table} +:header-rows: 1 +:stub-columns: 1 +:widths: auto - * - Feature - - Volta - - Turing - - Ampere - - Ada - - Hopper - - CPU - - AMD - * - [CP](#chunked-prefill) - - [✗](gh-issue:2729) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - [APC](#automatic-prefix-caching) - - [✗](gh-issue:3687) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - [LoRA](#lora-adapter) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - prmpt adptr - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:8475) - - ✅ - * - [SD](#spec_decode) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - * - pooling - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? 
- * - enc-dec - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - * - mm - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - prmpt logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - async output - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✗ - * - multi-step - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:8477) - - ✅ - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - guided dec - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ -``` +- * Feature + * Volta + * Turing + * Ampere + * Ada + * Hopper + * CPU + * AMD +- * [CP](#chunked-prefill) + * [✗](gh-issue:2729) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * [APC](#automatic-prefix-caching) + * [✗](gh-issue:3687) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * [LoRA](#lora-adapter) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * prmpt adptr + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:8475) + * ✅ +- * [SD](#spec_decode) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * CUDA graph + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ +- * pooling + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ? +- * enc-dec + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ +- * mm + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * prmpt logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * async output + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✗ +- * multi-step + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:8477) + * ✅ +- * best-of + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * beam-search + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * guided dec + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +::: diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index efa2efc66192e..52d253b9c2b18 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -4,9 +4,9 @@ This page introduces you the disaggregated prefilling feature in vLLM. -```{note} +:::{note} This feature is experimental and subject to change. -``` +::: ## Why disaggregated prefilling? @@ -15,9 +15,9 @@ Two main reasons: - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. -```{note} +:::{note} Disaggregated prefill DOES NOT improve throughput. -``` +::: ## Usage example @@ -39,21 +39,21 @@ Key abstractions for disaggregated prefilling: - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. 
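To make the abstractions above concrete, here is a small, self-contained Python sketch of the described semantics: an `insert` that returns immediately, a `drop_select` that blocks until a matching entry can be removed, and a one-way pipe with `send_tensor`/`recv_tensor`. The class names and the use of plain Python objects instead of tensors are illustrative assumptions; this is not vLLM's connector implementation. The note immediately below on blocking vs. non-blocking behaviour is exactly the property this sketch tries to mimic.

```python
import queue
import threading
from typing import Any, List, Tuple


class ToyPipe:
    """Single-direction FIFO pipe: one side sends, the other receives."""

    def __init__(self) -> None:
        self._q: "queue.Queue[Any]" = queue.Queue()

    def send_tensor(self, tensor: Any) -> None:
        self._q.put(tensor)

    def recv_tensor(self) -> Any:
        return self._q.get()  # blocks until something arrives


class ToyLookupBuffer:
    """KV-cache buffer with SQL-like insert / drop_select semantics."""

    def __init__(self) -> None:
        self._entries: List[Tuple[Any, Any]] = []  # (key, kv_cache) pairs
        self._cv = threading.Condition()

    def insert(self, key: Any, kv_cache: Any) -> None:
        # Non-blocking: enqueue the entry on a helper thread and return.
        def _do_insert() -> None:
            with self._cv:
                self._entries.append((key, kv_cache))
                self._cv.notify_all()

        threading.Thread(target=_do_insert, daemon=True).start()

    def drop_select(self, key: Any) -> Any:
        # Blocking: wait until a matching entry exists, then remove and return it.
        with self._cv:
            while True:
                for i, (k, v) in enumerate(self._entries):
                    if k == key:
                        self._entries.pop(i)
                        return v
                self._cv.wait()


buf = ToyLookupBuffer()
buf.insert("req-0", {"k": "...", "v": "..."})  # prefill side inserts KV cache
print(buf.drop_select("req-0"))                # decode side pops the matching entry
```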
-```{note} +:::{note} `insert` is non-blocking operation but `drop_select` is blocking operation. -``` +::: Here is a figure illustrating how the above 3 abstractions are organized: -```{image} /assets/features/disagg_prefill/abstraction.jpg +:::{image} /assets/features/disagg_prefill/abstraction.jpg :alt: Disaggregated prefilling abstractions -``` +::: The workflow of disaggregated prefilling is as follows: -```{image} /assets/features/disagg_prefill/overview.jpg +:::{image} /assets/features/disagg_prefill/overview.jpg :alt: Disaggregated prefilling workflow -``` +::: The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index b00d05147bb32..fb5a7a0d519cb 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -60,9 +60,9 @@ vllm serve meta-llama/Llama-2-7b-hf \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` -```{note} +:::{note} The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. -``` +::: The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 404505eb3890e..30735b1161ff3 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -2,11 +2,11 @@ # AutoAWQ -```{warning} +:::{warning} Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. -``` +::: To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index 1398e8a324201..a62e0124b7706 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -14,10 +14,10 @@ The FP8 types typically supported in hardware have two distinct representations, - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. -```{note} +:::{note} FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. 
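To get a feel for the two FP8 representations described above, the snippet below prints their dynamic ranges and round-trips a few values. It assumes a recent PyTorch build that exposes `torch.float8_e4m3fn` and `torch.float8_e5m2` (available since around PyTorch 2.1) and only illustrates the number formats, not vLLM's FP8 kernels.

```python
import torch

# E4M3 trades range for precision (max ~448); E5M2 trades precision for range (max ~57344).
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}")

x = torch.tensor([0.0123, 1.0, 300.0], dtype=torch.float32)
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    # Down-cast and up-cast to see how much precision each format keeps.
    print(dtype, x.to(dtype).to(torch.float32))
```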
-``` +::: ## Quick Start with Online Dynamic Quantization @@ -32,9 +32,9 @@ model = LLM("facebook/opt-125m", quantization="fp8") result = model.generate("Hello, my name is") ``` -```{warning} +:::{warning} Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -``` +::: ## Installation @@ -110,9 +110,9 @@ model.generate("Hello my name is") Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): -```{note} +:::{note} Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. -``` +::: ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic @@ -137,10 +137,10 @@ If you encounter any issues or have feature requests, please open an issue on th ## Deprecated Flow -```{note} +:::{note} The following information is preserved for reference and search purposes. The quantization method described below is deprecated in favor of the `llmcompressor` method described above. -``` +::: For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index 640997cf4bc39..65c181900f9be 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -2,13 +2,13 @@ # GGUF -```{warning} +:::{warning} Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -``` +::: -```{warning} +:::{warning} Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. -``` +::: To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: @@ -25,9 +25,9 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` -```{warning} +:::{warning} We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. -``` +::: You can also use the GGUF model directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 56ccdb5f00c34..d972dc85fc23c 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -4,7 +4,7 @@ Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
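Following on from the GGUF page above, which mentions using a GGUF checkpoint directly through the `LLM` entrypoint, a minimal sketch could look like this. The local `.gguf` path and the TinyLlama tokenizer are borrowed from the CLI examples on that page and are placeholders for your own files.

```python
from vllm import LLM, SamplingParams

# Load a local single-file GGUF checkpoint and reuse the base model's tokenizer,
# as the GGUF docs recommend, instead of converting the GGUF tokenizer.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```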
-```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 @@ -15,4 +15,4 @@ gguf int8 fp8 quantized_kvcache -``` +::: diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 592a60d3988b2..fedb16f4350e5 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -7,9 +7,9 @@ This quantization method is particularly useful for reducing model size while ma Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). -```{note} +:::{note} INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). -``` +::: ## Prerequisites @@ -119,9 +119,9 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -```{note} +:::{note} Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -``` +::: ## Best Practices diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index f5c0a95ea426e..555ed4ce4c8db 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -4,128 +4,129 @@ The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -```{list-table} +:::{list-table} :header-rows: 1 :widths: 20 8 8 8 8 8 8 8 8 8 8 -* - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU -* - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ -* - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ -* - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ -* - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ -* - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ -``` +- * Implementation + * Volta + * Turing + * Ampere + * Ada + * Hopper + * AMD GPU + * Intel GPU + * x86 CPU + * AWS Inferentia + * Google TPU +- * AWQ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✅︎ + * ✗ + * ✗ +- * GPTQ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✅︎ + * ✗ + * ✗ +- * Marlin (GPTQ/AWQ/FP8) + * ✗ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * INT8 (W8A8) + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✅︎ + * ✗ + * ✗ +- * FP8 (W8A8) + * ✗ + * ✗ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ +- * AQLM + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * bitsandbytes + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * DeepSpeedFP + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * GGUF + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + +::: - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - "✅︎" indicates that the quantization method is supported on the specified hardware. 
- "✗" indicates that the quantization method is not supported on the specified hardware. -```{note} +:::{note} This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. -``` +::: diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index ab7b2f302bd13..da87127057dc5 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -2,15 +2,15 @@ # Speculative Decoding -```{warning} +:::{warning} Please note that speculative decoding in vLLM is not yet optimized and does not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work to optimize it is ongoing and can be followed here: -``` +::: -```{warning} +:::{warning} Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. -``` +::: This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 1d77c7339a33f..90c880e8cfa46 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -95,10 +95,10 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -```{tip} +:::{tip} While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. This can improve the results notably in most cases. -``` +::: Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aaa13d0fb6d3f..ac592e22328da 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -57,9 +57,9 @@ class Index: def generate(self) -> str: content = f"# {self.title}\n\n{self.description}\n\n" - content += "```{toctree}\n" + content += ":::{toctree}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(self.documents) + "\n```\n" + content += "\n".join(self.documents) + "\n:::\n" return content diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index ae42dd0c0d08f..704a16233981f 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -86,9 +86,9 @@ docker build -f Dockerfile.hpu -t vllm-hpu-env . 
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` -```{tip} +:::{tip} If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -``` +::: ## Extra information @@ -155,30 +155,30 @@ Gaudi2 devices. Configurations that are not listed may or may not work. Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -```{list-table} vLLM execution modes +:::{list-table} vLLM execution modes :widths: 25 25 50 :header-rows: 1 -* - `PT_HPU_LAZY_MODE` - - `enforce_eager` - - execution mode -* - 0 - - 0 - - torch.compile -* - 0 - - 1 - - PyTorch eager mode -* - 1 - - 0 - - HPU Graphs -* - 1 - - 1 - - PyTorch lazy mode -``` - -```{warning} +- * `PT_HPU_LAZY_MODE` + * `enforce_eager` + * execution mode +- * 0 + * 0 + * torch.compile +- * 0 + * 1 + * PyTorch eager mode +- * 1 + * 0 + * HPU Graphs +- * 1 + * 1 + * PyTorch lazy mode +::: + +:::{warning} In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. -``` +::: (gaudi-bucketing-mechanism)= @@ -187,9 +187,9 @@ In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. -```{note} +:::{note} Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. -``` +::: Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: @@ -222,15 +222,15 @@ min = 128, step = 128, max = 512 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. 
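As a rough sketch of what the min/step/max bucketing configuration above implies for incoming requests, the helper below pads a value up to the smallest configured bucket that fits it. The concrete bucket lists are illustrative assumptions, the real ones are derived from the logged `min`/`step`/`max` values; the padding behaviour itself is worked through in the prose that follows.

```python
from bisect import bisect_left
from typing import Sequence


def pad_to_bucket(value: int, buckets: Sequence[int]) -> int:
    """Return the smallest configured bucket that can hold `value`."""
    idx = bisect_left(buckets, value)
    if idx == len(buckets):
        # Larger than the biggest bucket: processed without padding,
        # which may trigger a graph recompilation.
        return value
    return buckets[idx]


# Illustrative bucket lists only; not taken from a real configuration.
batch_size_buckets = [1, 2, 4, 8, 16, 32, 64]
seq_len_buckets = [128, 256, 384, 512]

# A batch of 3 sequences whose longest prompt is 412 tokens:
print(pad_to_bucket(3, batch_size_buckets))    # -> 4
print(pad_to_bucket(412, seq_len_buckets))     # -> 512
```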
Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. -```{warning} +:::{warning} If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. -``` +::: As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. -```{note} +:::{note} Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -``` +::: ### Warmup @@ -252,9 +252,9 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. -```{tip} +:::{tip} Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -``` +::: ### HPU Graph capture @@ -269,9 +269,9 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. -```{note} +:::{note} `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. -``` +::: User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. 
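The memory accounting described above (`gpu_memory_utilization`, then `VLLM_GRAPH_RESERVED_MEM`, then `VLLM_GRAPH_PROMPT_RATIO`) is easiest to follow as plain arithmetic. The sketch below reuses the 100 GiB / 50 GiB figures from the note and the 0.2 prompt ratio from the example; it is an illustration of the bookkeeping only, not code taken from vLLM.

```python
free_after_load_gib = 50.0      # free device memory after loading weights and the profiling run
gpu_memory_utilization = 0.9    # default: 90% of that free memory is treated as usable
graph_reserved_mem = 0.1        # VLLM_GRAPH_RESERVED_MEM: share of usable memory kept for HPU Graphs
graph_prompt_ratio = 0.2        # VLLM_GRAPH_PROMPT_RATIO: share of graph memory for prefill graphs

usable_gib = free_after_load_gib * gpu_memory_utilization   # 45.0 GiB usable, 5 GiB margin kept
graph_gib = usable_gib * graph_reserved_mem                  # 4.5 GiB reserved for graph capture
remaining_gib = usable_gib - graph_gib                       # 40.5 GiB left in the usable pool
prefill_graph_gib = graph_gib * graph_prompt_ratio           # 0.9 GiB for prefill graphs
decode_graph_gib = graph_gib - prefill_graph_gib             # 3.6 GiB for decode graphs

print(usable_gib, graph_gib, remaining_gib, prefill_graph_gib, decode_graph_gib)
```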
There are two strategies implemented: \- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode @@ -279,9 +279,9 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. -```{note} +:::{note} `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. -``` +::: Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): @@ -352,13 +352,13 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - - `{phase}` is either `PROMPT` or `DECODE` + * `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS`, `SEQ` or `BLOCK` + * `{dim}` is either `BS`, `SEQ` or `BLOCK` - - `{param}` is either `MIN`, `STEP` or `MAX` + * `{param}` is either `MIN`, `STEP` or `MAX` - - Default values: + * Default values: - Prompt: - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md index a6c4c44305a4c..88352f639567b 100644 --- a/docs/source/getting_started/installation/ai_accelerator/index.md +++ b/docs/source/getting_started/installation/ai_accelerator/index.md @@ -2,374 +2,374 @@ vLLM is a Python library that supports the following AI accelerators. 
Select your AI accelerator type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Configure a new environment -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} OpenVINO -:sync: openvino +:::: -```{include} ../python_env_setup.inc.md -``` +::::{tab-item} OpenVINO +:sync: openvino +:::{include} ../python_env_setup.inc.md ::: :::: +::::: + ## Set up using Python ### Pre-built wheels -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} OpenVINO +:::: + 
+::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: :::: +::::: + ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: :::: +::::: + ### Build image from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: :::: +::::: + ## Extra information -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} 
Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "## Extra information" -``` - ::: :::: + +::::: diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 575a9f9c2e2f0..145cc9d668efd 100644 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -67,9 +67,9 @@ Currently, there are no pre-built Neuron wheels. ### Build wheel from source -```{note} +:::{note} The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -``` +::: Following instructions are applicable to Neuron SDK 2.16 and beyond. diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 6a911cc6b9eba..6827afc805fd8 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -47,10 +47,10 @@ When you request queued resources, the request is added to a queue maintained by the Cloud TPU service. When the requested resource becomes available, it's assigned to your Google Cloud project for your immediate exclusive use. -```{note} +:::{note} In all of the following commands, replace the ALL CAPS parameter names with appropriate values. See the parameter descriptions table for more information. -``` +::: ### Provision Cloud TPUs with GKE @@ -75,33 +75,33 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -```{list-table} Parameter descriptions +:::{list-table} Parameter descriptions :header-rows: 1 -* - Parameter name - - Description -* - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. -* - TPU_NAME - - The user-assigned name of the TPU which is created when the queued +- * Parameter name + * Description +- * QUEUED_RESOURCE_ID + * The user-assigned ID of the queued resource request. +- * TPU_NAME + * The user-assigned name of the TPU which is created when the queued resource request is allocated. -* - PROJECT_ID - - Your Google Cloud project -* - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use +- * PROJECT_ID + * Your Google Cloud project +- * ZONE + * The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see `TPU regions and zones `_ -* - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example +- * ACCELERATOR_TYPE + * The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, see `TPU versions `_. -* - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. -* - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM +- * RUNTIME_VERSION + * The TPU VM runtime version to use. 
For more information see `TPU VM images `_. +- * SERVICE_ACCOUNT + * The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` -``` +::: Connect to your TPU using SSH: @@ -178,15 +178,15 @@ Run the Docker image with the following command: docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` -```{note} +:::{note} Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each shape. The compilation time may take 20~30 minutes in the first run. However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). -``` +::: -````{tip} +:::{tip} If you encounter the following error: ```console @@ -198,9 +198,10 @@ file or directory Install OpenBLAS with the following command: ```console -$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev ``` -```` + +::: ## Extra information diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md index 56545253b1ef7..0808b869fdb7b 100644 --- a/docs/source/getting_started/installation/cpu/apple.inc.md +++ b/docs/source/getting_started/installation/cpu/apple.inc.md @@ -25,9 +25,9 @@ pip install -r requirements-cpu.txt pip install -e . ``` -```{note} +:::{note} On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. -``` +::: #### Troubleshooting diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md index 4ec907c0e9fda..2f549ede0cf48 100644 --- a/docs/source/getting_started/installation/cpu/index.md +++ b/docs/source/getting_started/installation/cpu/index.md @@ -2,86 +2,86 @@ vLLM is a Python library that supports the following CPU variants. 
Select your CPU type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements - Python: 3.9 -- 3.12 -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Set up using Python ### Create a new Python environment -```{include} ../python_env_setup.inc.md -``` +:::{include} ../python_env_setup.inc.md +::: ### Pre-built wheels @@ -89,41 +89,41 @@ Currently, there are no pre-built CPU wheels. ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images @@ -142,9 +142,9 @@ $ docker run -it \ vllm-cpu-env ``` -:::{tip} +::::{tip} For ARM or Apple silicon, use `Dockerfile.arm` -::: +:::: ## Supported features diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index e0eaac5099305..f146ae0918b44 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -17,10 +17,10 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, :::{include} build.inc.md ::: -```{note} +:::{note} - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. 
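Since the x86 note above says the CPU backend build script inspects the host CPU flags to decide whether to enable AVX512_BF16, here is a small Linux-only convenience snippet (not part of vLLM's build system) for checking those flags yourself before deciding whether to force `VLLM_CPU_AVX512BF16=1` for a cross-compilation.

```python
def cpu_has_flag(flag: str) -> bool:
    """Best-effort check of /proc/cpuinfo for an ISA flag (Linux only)."""
    try:
        with open("/proc/cpuinfo") as f:
            return any(flag in line.split() for line in f if line.startswith("flags"))
    except OSError:
        return False


for flag in ("avx512f", "avx512_bf16"):
    print(f"{flag}: {'present' if cpu_has_flag(flag) else 'not reported'}")
```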
-``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 4cce65278c069..5c2ea30dbfde1 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -10,9 +10,9 @@ vLLM contains pre-compiled C++ and CUDA (12.1) binaries. ### Create a new Python environment -```{note} +:::{note} PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. -``` +::: In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. @@ -100,10 +100,10 @@ pip install --editable . You can find more information about vLLM's wheels in . -```{note} +:::{note} There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. -``` +::: #### Full build (with compilation) @@ -115,7 +115,7 @@ cd vllm pip install -e . ``` -```{tip} +:::{tip} Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . @@ -123,7 +123,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. -``` +::: ##### Use an existing PyTorch installation diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md index 6c007382b2c3d..0a61f889753a3 100644 --- a/docs/source/getting_started/installation/gpu/index.md +++ b/docs/source/getting_started/installation/gpu/index.md @@ -2,299 +2,299 @@ vLLM is a Python library that supports the following GPU variants. 
Select your GPU type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements - OS: Linux - Python: 3.9 -- 3.12 -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Set up using Python ### Create a new Python environment -```{include} ../python_env_setup.inc.md -``` +:::{include} ../python_env_setup.inc.md +::: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Create a new Python environment" :end-before: "### Pre-built wheels" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm There is no extra information on creating a new Python environment for this device. -::: +:::: -:::{tab-item} XPU +::::{tab-item} XPU :sync: xpu There is no extra information on creating a new Python environment for this device. 
-::: - :::: +::::: + ### Pre-built wheels -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: :::: +::::: + (build-from-source)= ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: :::: +::::: + ### Build image from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: :::: +::::: + ## Supported features -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Supported features" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "## Supported features" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "## Supported features" -``` - ::: :::: + +::::: diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 69238f6e36fb2..131ad1704ea11 
100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -16,10 +16,10 @@ Currently, there are no pre-built ROCm wheels. However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. -```{tip} +:::{tip} Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) for instructions on how to use this prebuilt docker image. -``` +::: ### Build wheel from source @@ -47,9 +47,9 @@ for instructions on how to use this prebuilt docker image. cd ../.. ``` - ```{note} - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - ``` + :::{note} + If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ::: 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) @@ -67,9 +67,9 @@ for instructions on how to use this prebuilt docker image. cd .. ``` - ```{note} - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - ``` + :::{note} + You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ::: 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: @@ -95,17 +95,18 @@ for instructions on how to use this prebuilt docker image. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - ```{tip} + + :::{tip} - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - ``` + ::: -```{tip} +:::{tip} - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 577986eba74fd..bc01c6000bc07 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -30,10 +30,10 @@ pip install -v -r requirements-xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -```{note} +:::{note} - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. 
-``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index bc1d268bf0c7e..0f5e013ce071a 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -4,10 +4,10 @@ vLLM supports the following hardware platforms: -```{toctree} +:::{toctree} :maxdepth: 1 gpu/index cpu/index ai_accelerator/index -``` +::: diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md index 25cfac5f58aa7..cb73914c9c75e 100644 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -6,9 +6,9 @@ conda create -n myenv python=3.12 -y conda activate myenv ``` -```{note} +:::{note} [PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. -``` +::: Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 8ac80e5e5c553..f4682ee45a48e 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -32,9 +32,9 @@ conda activate myenv pip install vllm ``` -```{note} +:::{note} For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. -``` +::: (quickstart-offline)= @@ -69,9 +69,9 @@ The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model]( llm = LLM(model="facebook/opt-125m") ``` -```{note} +:::{note} By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. -``` +::: Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. @@ -97,10 +97,10 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` -```{note} +:::{note} By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here](#chat-template). -``` +::: This server can be queried in the same format as OpenAI API. For example, to list the models: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 7bfe9b4036adf..2f41fa3b6b19e 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -4,9 +4,9 @@ This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. 
If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -```{note} +:::{note} Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -``` +::: ## Hangs downloading a model @@ -18,9 +18,9 @@ It's recommended to download the model first using the [huggingface-cli](https:/ If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -```{note} +:::{note} To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. -``` +::: ## Out of memory @@ -132,14 +132,14 @@ If the script runs successfully, you should see the message `sanity check is suc If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. -```{note} +:::{note} A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. -``` +::: (troubleshooting-python-multiprocessing)= diff --git a/docs/source/index.md b/docs/source/index.md index 6957d5dd0f2e7..e90e81c72860a 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,13 +1,13 @@ # Welcome to vLLM -```{figure} ./assets/logos/vllm-logo-text-light.png +:::{figure} ./assets/logos/vllm-logo-text-light.png :align: center :alt: vLLM :class: no-scaled-link :width: 60% -``` +::: -```{raw} html +:::{raw} html

Easy, fast, and cheap LLM serving for everyone @@ -19,7 +19,7 @@ Watch Fork

-``` +::: vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -58,7 +58,7 @@ For more information, check out the following: % How to start using vLLM? -```{toctree} +:::{toctree} :caption: Getting Started :maxdepth: 1 @@ -67,11 +67,11 @@ getting_started/quickstart getting_started/examples/examples_index getting_started/troubleshooting getting_started/faq -``` +::: % What does vLLM support? -```{toctree} +:::{toctree} :caption: Models :maxdepth: 1 @@ -79,11 +79,11 @@ models/generative_models models/pooling_models models/supported_models models/extensions/index -``` +::: % Additional capabilities -```{toctree} +:::{toctree} :caption: Features :maxdepth: 1 @@ -96,11 +96,11 @@ features/automatic_prefix_caching features/disagg_prefill features/spec_decode features/compatibility_matrix -``` +::: % Details about running vLLM -```{toctree} +:::{toctree} :caption: Inference and Serving :maxdepth: 1 @@ -113,11 +113,11 @@ serving/engine_args serving/env_vars serving/usage_stats serving/integrations/index -``` +::: % Scaling up vLLM for production -```{toctree} +:::{toctree} :caption: Deployment :maxdepth: 1 @@ -126,21 +126,21 @@ deployment/k8s deployment/nginx deployment/frameworks/index deployment/integrations/index -``` +::: % Making the most out of vLLM -```{toctree} +:::{toctree} :caption: Performance :maxdepth: 1 performance/optimization performance/benchmarks -``` +::: % Explanation of vLLM internals -```{toctree} +:::{toctree} :caption: Design Documents :maxdepth: 2 @@ -151,11 +151,11 @@ design/kernel/paged_attention design/mm_processing design/automatic_prefix_caching design/multiprocessing -``` +::: % How to contribute to the vLLM project -```{toctree} +:::{toctree} :caption: Developer Guide :maxdepth: 2 @@ -164,11 +164,11 @@ contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index contributing/vulnerability_management -``` +::: % Technical API specifications -```{toctree} +:::{toctree} :caption: API Reference :maxdepth: 2 @@ -177,18 +177,18 @@ api/engine/index api/inference_params api/multimodal/index api/model/index -``` +::: % Latest news and acknowledgements -```{toctree} +:::{toctree} :caption: Community :maxdepth: 1 community/blog community/meetups community/sponsors -``` +::: ## Indices and tables diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md index cff09d12eba47..69faf472e5300 100644 --- a/docs/source/models/extensions/index.md +++ b/docs/source/models/extensions/index.md @@ -1,8 +1,8 @@ # Built-in Extensions -```{toctree} +:::{toctree} :maxdepth: 1 runai_model_streamer tensorizer -``` +::: diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index 75f7a9fcad416..99c37876a01b3 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -48,6 +48,6 @@ You can read further about CPU buffer memory limiting [here](https://github.com/ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` -```{note} +:::{note} For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). 
-``` +::: diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index ae17e3437bca6..830c579d91bae 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -11,6 +11,6 @@ For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). -```{note} +:::{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. -``` +::: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index e4b4cd03a90d2..4abe6b776eea3 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -70,10 +70,10 @@ The {class}`~vllm.LLM.chat` method implements chat functionality on top of {clas In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. -```{important} +:::{important} In general, only instruction-tuned models have a chat template. Base models may perform poorly as they are not trained to respond to the chat conversation. -``` +::: ```python llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 91db694be29a4..9704ccee745c4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -8,54 +8,54 @@ In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmMo These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input before returning them. -```{note} +:::{note} We currently support pooling models primarily as a matter of convenience. As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -``` +::: For pooling models, we support the following `--task` options. The selected option sets the default pooler used to extract the final hidden states: -```{list-table} +:::{list-table} :widths: 50 25 25 25 :header-rows: 1 -* - Task - - Pooling Type - - Normalization - - Softmax -* - Embedding (`embed`) - - `LAST` - - ✅︎ - - ✗ -* - Classification (`classify`) - - `LAST` - - ✗ - - ✅︎ -* - Sentence Pair Scoring (`score`) - - \* - - \* - - \* -* - Reward Modeling (`reward`) - - `ALL` - - ✗ - - ✗ -``` +- * Task + * Pooling Type + * Normalization + * Softmax +- * Embedding (`embed`) + * `LAST` + * ✅︎ + * ✗ +- * Classification (`classify`) + * `LAST` + * ✗ + * ✅︎ +- * Sentence Pair Scoring (`score`) + * \* + * \* + * \* +- * Reward Modeling (`reward`) + * `ALL` + * ✗ + * ✗ +::: \*The default pooler is always defined by the model. -```{note} +:::{note} If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. 
-``` +::: When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). -```{tip} +:::{tip} You can customize the model's pooling method via the `--override-pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. -``` +::: ## Offline Inference @@ -111,10 +111,10 @@ The {class}`~vllm.LLM.score` method outputs similarity scores between sentence p It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html). These types of models serve as rerankers between candidate query-document pairs in RAG systems. -```{note} +:::{note} vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). -``` +::: ```python llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index e59150cdd3b83..94f4bd6cadabd 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -17,7 +17,7 @@ By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co To determine whether a given model is supported, you can check the `config.json` file inside the HF repository. If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory. -````{tip} +:::{tip} The easiest way to check if your model is really supported at runtime is to run the program below: ```python @@ -35,7 +35,7 @@ print(output) ``` If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. -```` +::: Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. @@ -72,364 +72,364 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `AquilaForCausalLM` - - Aquila, Aquila2 - - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. - - ✅︎ - - ✅︎ -* - `ArcticForCausalLM` - - Arctic - - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. - - - - ✅︎ -* - `BaiChuanForCausalLM` - - Baichuan2, Baichuan - - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. - - ✅︎ - - ✅︎ -* - `BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - `bigscience/bloom`, `bigscience/bloomz`, etc. - - - - ✅︎ -* - `BartForConditionalGeneration` - - BART - - `facebook/bart-base`, `facebook/bart-large-cnn`, etc. - - - - -* - `ChatGLMModel` - - ChatGLM - - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. - - ✅︎ - - ✅︎ -* - `CohereForCausalLM`, `Cohere2ForCausalLM` - - Command-R - - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. - - ✅︎ - - ✅︎ -* - `DbrxForCausalLM` - - DBRX - - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. 
- - - - ✅︎ -* - `DeciLMForCausalLM` - - DeciLM - - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. - - - - ✅︎ -* - `DeepseekForCausalLM` - - DeepSeek - - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. - - - - ✅︎ -* - `DeepseekV2ForCausalLM` - - DeepSeek-V2 - - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. - - - - ✅︎ -* - `DeepseekV3ForCausalLM` - - DeepSeek-V3 - - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. - - - - ✅︎ -* - `ExaoneForCausalLM` - - EXAONE-3 - - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - - ✅︎ - - ✅︎ -* - `FalconForCausalLM` - - Falcon - - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. - - - - ✅︎ -* - `FalconMambaForCausalLM` - - FalconMamba - - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `GemmaForCausalLM` - - Gemma - - `google/gemma-2b`, `google/gemma-7b`, etc. - - ✅︎ - - ✅︎ -* - `Gemma2ForCausalLM` - - Gemma2 - - `google/gemma-2-9b`, `google/gemma-2-27b`, etc. - - ✅︎ - - ✅︎ -* - `GlmForCausalLM` - - GLM-4 - - `THUDM/glm-4-9b-chat-hf`, etc. - - ✅︎ - - ✅︎ -* - `GPT2LMHeadModel` - - GPT-2 - - `gpt2`, `gpt2-xl`, etc. - - - - ✅︎ -* - `GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. - - ✅︎ - - ✅︎ -* - `GPTJForCausalLM` - - GPT-J - - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. - - - - ✅︎ -* - `GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. - - - - ✅︎ -* - `GraniteForCausalLM` - - Granite 3.0, Granite 3.1, PowerLM - - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. - - ✅︎ - - ✅︎ -* - `GraniteMoeForCausalLM` - - Granite 3.0 MoE, PowerMoE - - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. - - ✅︎ - - ✅︎ -* - `GritLM` - - GritLM - - `parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ -* - `InternLMForCausalLM` - - InternLM - - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. - - ✅︎ - - ✅︎ -* - `InternLM2ForCausalLM` - - InternLM2 - - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - - ✅︎ - - ✅︎ -* - `InternLM3ForCausalLM` - - InternLM3 - - `internlm/internlm3-8b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `JAISLMHeadModel` - - Jais - - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. - - - - ✅︎ -* - `JambaForCausalLM` - - Jamba - - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `LlamaForCausalLM` - - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. - - ✅︎ - - ✅︎ -* - `MambaForCausalLM` - - Mamba - - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. - - - - ✅︎ -* - `MiniCPMForCausalLM` - - MiniCPM - - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. - - ✅︎ - - ✅︎ -* - `MiniCPM3ForCausalLM` - - MiniCPM3 - - `openbmb/MiniCPM3-4B`, etc. 
- - ✅︎ - - ✅︎ -* - `MistralForCausalLM` - - Mistral, Mistral-Instruct - - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. - - - - ✅︎ -* - `NemotronForCausalLM` - - Nemotron-3, Nemotron-4, Minitron - - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - - ✅︎ - - ✅︎ -* - `OLMoForCausalLM` - - OLMo - - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. - - - - ✅︎ -* - `OLMo2ForCausalLM` - - OLMo2 - - `allenai/OLMo2-7B-1124`, etc. - - - - ✅︎ -* - `OLMoEForCausalLM` - - OLMoE - - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. - - ✅︎ - - ✅︎ -* - `OPTForCausalLM` - - OPT, OPT-IML - - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. - - - - ✅︎ -* - `OrionForCausalLM` - - Orion - - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. - - - - ✅︎ -* - `PhiForCausalLM` - - Phi - - `microsoft/phi-1_5`, `microsoft/phi-2`, etc. - - ✅︎ - - ✅︎ -* - `Phi3ForCausalLM` - - Phi-4, Phi-3 - - `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - - ✅︎ - - ✅︎ -* - `Phi3SmallForCausalLM` - - Phi-3-Small - - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. - - - - ✅︎ -* - `PhiMoEForCausalLM` - - Phi-3.5-MoE - - `microsoft/Phi-3.5-MoE-instruct`, etc. - - ✅︎ - - ✅︎ -* - `PersimmonForCausalLM` - - Persimmon - - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. - - - - ✅︎ -* - `QWenLMHeadModel` - - Qwen - - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForCausalLM` - - QwQ, Qwen2 - - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2MoeForCausalLM` - - Qwen2MoE - - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - - - - ✅︎ -* - `StableLmForCausalLM` - - StableLM - - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. - - - - ✅︎ -* - `Starcoder2ForCausalLM` - - Starcoder2 - - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. - - - - ✅︎ -* - `SolarForCausalLM` - - Solar Pro - - `upstage/solar-pro-preview-instruct`, etc. - - ✅︎ - - ✅︎ -* - `TeleChat2ForCausalLM` - - TeleChat2 - - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. - - ✅︎ - - ✅︎ -* - `XverseForCausalLM` - - XVERSE - - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. - - ✅︎ - - ✅︎ -``` - -```{note} +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `AquilaForCausalLM` + * Aquila, Aquila2 + * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. + * ✅︎ + * ✅︎ +- * `ArcticForCausalLM` + * Arctic + * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. + * + * ✅︎ +- * `BaiChuanForCausalLM` + * Baichuan2, Baichuan + * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. + * ✅︎ + * ✅︎ +- * `BloomForCausalLM` + * BLOOM, BLOOMZ, BLOOMChat + * `bigscience/bloom`, `bigscience/bloomz`, etc. + * + * ✅︎ +- * `BartForConditionalGeneration` + * BART + * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. 
+ * + * +- * `ChatGLMModel` + * ChatGLM + * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + * ✅︎ + * ✅︎ +- * `CohereForCausalLM`, `Cohere2ForCausalLM` + * Command-R + * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. + * ✅︎ + * ✅︎ +- * `DbrxForCausalLM` + * DBRX + * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. + * + * ✅︎ +- * `DeciLMForCausalLM` + * DeciLM + * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + * + * ✅︎ +- * `DeepseekForCausalLM` + * DeepSeek + * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. + * + * ✅︎ +- * `DeepseekV2ForCausalLM` + * DeepSeek-V2 + * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. + * + * ✅︎ +- * `DeepseekV3ForCausalLM` + * DeepSeek-V3 + * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. + * + * ✅︎ +- * `ExaoneForCausalLM` + * EXAONE-3 + * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + * ✅︎ + * ✅︎ +- * `FalconForCausalLM` + * Falcon + * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. + * + * ✅︎ +- * `FalconMambaForCausalLM` + * FalconMamba + * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `GemmaForCausalLM` + * Gemma + * `google/gemma-2b`, `google/gemma-7b`, etc. + * ✅︎ + * ✅︎ +- * `Gemma2ForCausalLM` + * Gemma2 + * `google/gemma-2-9b`, `google/gemma-2-27b`, etc. + * ✅︎ + * ✅︎ +- * `GlmForCausalLM` + * GLM-4 + * `THUDM/glm-4-9b-chat-hf`, etc. + * ✅︎ + * ✅︎ +- * `GPT2LMHeadModel` + * GPT-2 + * `gpt2`, `gpt2-xl`, etc. + * + * ✅︎ +- * `GPTBigCodeForCausalLM` + * StarCoder, SantaCoder, WizardCoder + * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. + * ✅︎ + * ✅︎ +- * `GPTJForCausalLM` + * GPT-J + * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. + * + * ✅︎ +- * `GPTNeoXForCausalLM` + * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. + * + * ✅︎ +- * `GraniteForCausalLM` + * Granite 3.0, Granite 3.1, PowerLM + * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. + * ✅︎ + * ✅︎ +- * `GraniteMoeForCausalLM` + * Granite 3.0 MoE, PowerMoE + * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. + * ✅︎ + * ✅︎ +- * `GritLM` + * GritLM + * `parasail-ai/GritLM-7B-vllm`. + * ✅︎ + * ✅︎ +- * `InternLMForCausalLM` + * InternLM + * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. + * ✅︎ + * ✅︎ +- * `InternLM2ForCausalLM` + * InternLM2 + * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. + * ✅︎ + * ✅︎ +- * `InternLM3ForCausalLM` + * InternLM3 + * `internlm/internlm3-8b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `JAISLMHeadModel` + * Jais + * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. + * + * ✅︎ +- * `JambaForCausalLM` + * Jamba + * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `LlamaForCausalLM` + * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. 
+ * ✅︎ + * ✅︎ +- * `MambaForCausalLM` + * Mamba + * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. + * + * ✅︎ +- * `MiniCPMForCausalLM` + * MiniCPM + * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. + * ✅︎ + * ✅︎ +- * `MiniCPM3ForCausalLM` + * MiniCPM3 + * `openbmb/MiniCPM3-4B`, etc. + * ✅︎ + * ✅︎ +- * `MistralForCausalLM` + * Mistral, Mistral-Instruct + * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `MixtralForCausalLM` + * Mixtral-8x7B, Mixtral-8x7B-Instruct + * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `MPTForCausalLM` + * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. + * + * ✅︎ +- * `NemotronForCausalLM` + * Nemotron-3, Nemotron-4, Minitron + * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + * ✅︎ + * ✅︎ +- * `OLMoForCausalLM` + * OLMo + * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. + * + * ✅︎ +- * `OLMo2ForCausalLM` + * OLMo2 + * `allenai/OLMo2-7B-1124`, etc. + * + * ✅︎ +- * `OLMoEForCausalLM` + * OLMoE + * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. + * ✅︎ + * ✅︎ +- * `OPTForCausalLM` + * OPT, OPT-IML + * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. + * + * ✅︎ +- * `OrionForCausalLM` + * Orion + * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. + * + * ✅︎ +- * `PhiForCausalLM` + * Phi + * `microsoft/phi-1_5`, `microsoft/phi-2`, etc. + * ✅︎ + * ✅︎ +- * `Phi3ForCausalLM` + * Phi-4, Phi-3 + * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + * ✅︎ + * ✅︎ +- * `Phi3SmallForCausalLM` + * Phi-3-Small + * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. + * + * ✅︎ +- * `PhiMoEForCausalLM` + * Phi-3.5-MoE + * `microsoft/Phi-3.5-MoE-instruct`, etc. + * ✅︎ + * ✅︎ +- * `PersimmonForCausalLM` + * Persimmon + * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. + * + * ✅︎ +- * `QWenLMHeadModel` + * Qwen + * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForCausalLM` + * QwQ, Qwen2 + * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2MoeForCausalLM` + * Qwen2MoE + * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + * + * ✅︎ +- * `StableLmForCausalLM` + * StableLM + * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. + * + * ✅︎ +- * `Starcoder2ForCausalLM` + * Starcoder2 + * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. + * + * ✅︎ +- * `SolarForCausalLM` + * Solar Pro + * `upstage/solar-pro-preview-instruct`, etc. + * ✅︎ + * ✅︎ +- * `TeleChat2ForCausalLM` + * TeleChat2 + * `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. + * ✅︎ + * ✅︎ +- * `XverseForCausalLM` + * XVERSE + * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. + * ✅︎ + * ✅︎ +::: + +:::{note} Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -``` +::: ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. 
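To make the task selection concrete before the tables that follow, here is a minimal offline sketch of running one of the listed embedding models in pooling mode. It assumes the `task="embed"` option and the `LLM.embed()` helper described on the pooling-models page; the model name is just one entry taken from the embedding table below, and output field names may differ slightly between vLLM releases.

```python
from vllm import LLM

# Explicitly pin the pooling task so a dual-purpose architecture is not
# loaded in generative mode (see the note that follows).
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

(output,) = llm.embed("Follow the white rabbit.")
embedding = output.outputs.embedding  # flat list of floats
print(f"Embedding dimension: {len(embedding)}")
```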
-```{important} +:::{important} Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -``` +::: #### Text Embedding (`--task embed`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `BertModel` - - BERT-based - - `BAAI/bge-base-en-v1.5`, etc. - - - - -* - `Gemma2Model` - - Gemma2-based - - `BAAI/bge-multilingual-gemma2`, etc. - - - - ✅︎ -* - `GritLM` - - GritLM - - `parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ -* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. - - Llama-based - - `intfloat/e5-mistral-7b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2Model`, `Qwen2ForCausalLM` - - Qwen2-based - - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - - ✅︎ - - ✅︎ -* - `RobertaModel`, `RobertaForMaskedLM` - - RoBERTa-based - - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. - - - - -* - `XLMRobertaModel` - - XLM-RoBERTa-based - - `intfloat/multilingual-e5-large`, etc. - - - - -``` - -```{note} +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `BertModel` + * BERT-based + * `BAAI/bge-base-en-v1.5`, etc. + * + * +- * `Gemma2Model` + * Gemma2-based + * `BAAI/bge-multilingual-gemma2`, etc. + * + * ✅︎ +- * `GritLM` + * GritLM + * `parasail-ai/GritLM-7B-vllm`. + * ✅︎ + * ✅︎ +- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. + * Llama-based + * `intfloat/e5-mistral-7b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2Model`, `Qwen2ForCausalLM` + * Qwen2-based + * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + * ✅︎ + * ✅︎ +- * `RobertaModel`, `RobertaForMaskedLM` + * RoBERTa-based + * `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. + * + * +- * `XLMRobertaModel` + * XLM-RoBERTa-based + * `intfloat/multilingual-e5-large`, etc. + * + * +::: + +:::{note} `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. -``` +::: -```{note} +:::{note} Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. @@ -438,7 +438,7 @@ despite being described otherwise on its model card. Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). -``` +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings @@ -446,98 +446,98 @@ of the whole prompt are extracted from the normalized hidden state corresponding #### Reward Modeling (`--task reward`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `InternLM2ForRewardModel` - - InternLM2-based - - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. 
- - ✅︎ - - ✅︎ -* - `LlamaForCausalLM` - - Llama-based - - `peiyi9979/math-shepherd-mistral-7b-prm`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForRewardModel` - - Qwen2-based - - `Qwen/Qwen2.5-Math-RM-72B`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForProcessRewardModel` - - Qwen2-based - - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `InternLM2ForRewardModel` + * InternLM2-based + * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. + * ✅︎ + * ✅︎ +- * `LlamaForCausalLM` + * Llama-based + * `peiyi9979/math-shepherd-mistral-7b-prm`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForRewardModel` + * Qwen2-based + * `Qwen/Qwen2.5-Math-RM-72B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForProcessRewardModel` + * Qwen2-based + * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + * ✅︎ + * ✅︎ +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. -```{important} +:::{important} For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -``` +::: #### Classification (`--task classify`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `JambaForSequenceClassification` - - Jamba - - `ai21labs/Jamba-tiny-reward-dev`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForSequenceClassification` - - Qwen2-based - - `jason9693/Qwen2.5-1.5B-apeach`, etc. - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `JambaForSequenceClassification` + * Jamba + * `ai21labs/Jamba-tiny-reward-dev`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForSequenceClassification` + * Qwen2-based + * `jason9693/Qwen2.5-1.5B-apeach`, etc. + * ✅︎ + * ✅︎ +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `BertForSequenceClassification` - - BERT-based - - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - -* - `RobertaForSequenceClassification` - - RoBERTa-based - - `cross-encoder/quora-roberta-base`, etc. - - - - -* - `XLMRobertaForSequenceClassification` - - XLM-RoBERTa-based - - `BAAI/bge-reranker-v2-m3`, etc. - - - - -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `BertForSequenceClassification` + * BERT-based + * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + * + * +- * `RobertaForSequenceClassification` + * RoBERTa-based + * `cross-encoder/quora-roberta-base`, etc. + * + * +- * `XLMRobertaForSequenceClassification` + * XLM-RoBERTa-based + * `BAAI/bge-reranker-v2-m3`, etc. 
+ * + * +::: (supported-mm-models)= @@ -560,11 +560,12 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. -````{important} +:::{important} To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: Offline inference: + ```python llm = LLM( model="Qwen/Qwen2-VL-7B-Instruct", @@ -573,14 +574,16 @@ llm = LLM( ``` Online serving: + ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 ``` -```` -```{note} +::: + +:::{note} vLLM currently only supports adding LoRA to the language backbone of multimodal models. -``` +::: ### Generative Models @@ -588,256 +591,256 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{list-table} +:::{list-table} :widths: 25 25 15 20 5 5 5 :header-rows: 1 -* - Architecture - - Models - - Inputs - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) - - [V1](gh-issue:8779) -* - `AriaForConditionalGeneration` - - Aria - - T + I+ - - `rhymes-ai/Aria` - - - - ✅︎ - - ✅︎ -* - `Blip2ForConditionalGeneration` - - BLIP-2 - - T + IE - - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - - - ✅︎ - - ✅︎ -* - `ChameleonForConditionalGeneration` - - Chameleon - - T + I - - `facebook/chameleon-7b` etc. - - - - ✅︎ - - ✅︎ -* - `DeepseekVLV2ForCausalLM` - - DeepSeek-VL2 - - T + I+ - - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) - - - - ✅︎ - - ✅︎ -* - `FuyuForCausalLM` - - Fuyu - - T + I - - `adept/fuyu-8b` etc. - - - - ✅︎ - - ✅︎ -* - `ChatGLMModel` - - GLM-4V - - T + I - - `THUDM/glm-4v-9b` etc. - - ✅︎ - - ✅︎ - - -* - `H2OVLChatModel` - - H2OVL - - T + IE+ - - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. - - - - ✅︎ - - -* - `Idefics3ForConditionalGeneration` - - Idefics3 - - T + I - - `HuggingFaceM4/Idefics3-8B-Llama3` etc. - - ✅︎ - - - - -* - `InternVLChatModel` - - InternVL 2.5, Mono-InternVL, InternVL 2.0 - - T + IE+ - - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaForConditionalGeneration` - - LLaVA-1.5 - - T + IE+ - - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - - - ✅︎ - - ✅︎ -* - `LlavaNextForConditionalGeneration` - - LLaVA-NeXT - - T + IE+ - - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaNextVideoForConditionalGeneration` - - LLaVA-NeXT-Video - - T + V - - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaOnevisionForConditionalGeneration` - - LLaVA-Onevision - - T + I+ + V+ - - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `MiniCPMV` - - MiniCPM-V - - T + IE+ - - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. - - ✅︎ - - ✅︎ - - -* - `MllamaForConditionalGeneration` - - Llama 3.2 - - T + I+ - - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. - - - - - - -* - `MolmoForCausalLM` - - Molmo - - T + I - - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. 
- - ✅︎ - - ✅︎ - - ✅︎ -* - `NVLM_D_Model` - - NVLM-D 1.0 - - T + IE+ - - `nvidia/NVLM-D-72B`, etc. - - - - ✅︎ - - ✅︎ -* - `PaliGemmaForConditionalGeneration` - - PaliGemma, PaliGemma 2 - - T + IE - - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. - - - - ✅︎ - - -* - `Phi3VForCausalLM` - - Phi-3-Vision, Phi-3.5-Vision - - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - - - - ✅︎ - - ✅︎ -* - `PixtralForConditionalGeneration` - - Pixtral - - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - - - ✅︎ - - ✅︎ -* - `QWenLMHeadModel` - - Qwen-VL - - T + IE+ - - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. - - ✅︎ - - ✅︎ - - ✅︎ -* - `Qwen2AudioForConditionalGeneration` - - Qwen2-Audio - - T + A+ - - `Qwen/Qwen2-Audio-7B-Instruct` - - - - ✅︎ - - ✅︎ -* - `Qwen2VLForConditionalGeneration` - - QVQ, Qwen2-VL - - T + IE+ + VE+ - - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅︎ - - ✅︎ - - ✅︎ -* - `UltravoxModel` - - Ultravox - - T + AE+ - - `fixie-ai/ultravox-v0_3` - - - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Inputs + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) + * [V1](gh-issue:8779) +- * `AriaForConditionalGeneration` + * Aria + * T + I+ + * `rhymes-ai/Aria` + * + * ✅︎ + * ✅︎ +- * `Blip2ForConditionalGeneration` + * BLIP-2 + * T + IE + * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. + * + * ✅︎ + * ✅︎ +- * `ChameleonForConditionalGeneration` + * Chameleon + * T + I + * `facebook/chameleon-7b` etc. + * + * ✅︎ + * ✅︎ +- * `DeepseekVLV2ForCausalLM` + * DeepSeek-VL2 + * T + I+ + * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + * + * ✅︎ + * ✅︎ +- * `FuyuForCausalLM` + * Fuyu + * T + I + * `adept/fuyu-8b` etc. + * + * ✅︎ + * ✅︎ +- * `ChatGLMModel` + * GLM-4V + * T + I + * `THUDM/glm-4v-9b` etc. + * ✅︎ + * ✅︎ + * +- * `H2OVLChatModel` + * H2OVL + * T + IE+ + * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. + * + * ✅︎ + * +- * `Idefics3ForConditionalGeneration` + * Idefics3 + * T + I + * `HuggingFaceM4/Idefics3-8B-Llama3` etc. + * ✅︎ + * + * +- * `InternVLChatModel` + * InternVL 2.5, Mono-InternVL, InternVL 2.0 + * T + IE+ + * `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaForConditionalGeneration` + * LLaVA-1.5 + * T + IE+ + * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. + * + * ✅︎ + * ✅︎ +- * `LlavaNextForConditionalGeneration` + * LLaVA-NeXT + * T + IE+ + * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaNextVideoForConditionalGeneration` + * LLaVA-NeXT-Video + * T + V + * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaOnevisionForConditionalGeneration` + * LLaVA-Onevision + * T + I+ + V+ + * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `MiniCPMV` + * MiniCPM-V + * T + IE+ + * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. + * ✅︎ + * ✅︎ + * +- * `MllamaForConditionalGeneration` + * Llama 3.2 + * T + I+ + * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. 
+ * + * + * +- * `MolmoForCausalLM` + * Molmo + * T + I + * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `NVLM_D_Model` + * NVLM-D 1.0 + * T + IE+ + * `nvidia/NVLM-D-72B`, etc. + * + * ✅︎ + * ✅︎ +- * `PaliGemmaForConditionalGeneration` + * PaliGemma, PaliGemma 2 + * T + IE + * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. + * + * ✅︎ + * +- * `Phi3VForCausalLM` + * Phi-3-Vision, Phi-3.5-Vision + * T + IE+ + * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. + * + * ✅︎ + * ✅︎ +- * `PixtralForConditionalGeneration` + * Pixtral + * T + I+ + * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. + * + * ✅︎ + * ✅︎ +- * `QWenLMHeadModel` + * Qwen-VL + * T + IE+ + * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `Qwen2AudioForConditionalGeneration` + * Qwen2-Audio + * T + A+ + * `Qwen/Qwen2-Audio-7B-Instruct` + * + * ✅︎ + * ✅︎ +- * `Qwen2VLForConditionalGeneration` + * QVQ, Qwen2-VL + * T + IE+ + VE+ + * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `UltravoxModel` + * Ultravox + * T + AE+ + * `fixie-ai/ultravox-v0_3` + * + * ✅︎ + * ✅︎ +::: E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -```{note} +:::{note} To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -``` +::: -```{note} +:::{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. -``` +::: -```{note} +:::{note} The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: -``` +::: -```{note} +:::{note} The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). A corrected version is available at . -``` +::: ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. -```{important} +:::{important} Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -``` +::: #### Text Embedding (`--task embed`) Any text generation model can be converted into an embedding model by passing `--task embed`. -```{note} +:::{note} To get the best results, you should use pooling models that are specifically trained as such. -``` +::: The following table lists those that are tested in vLLM. 
-```{list-table} +:::{list-table} :widths: 25 25 15 25 5 5 :header-rows: 1 -* - Architecture - - Models - - Inputs - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `LlavaNextForConditionalGeneration` - - LLaVA-NeXT-based - - T / I - - `royokong/e5-v` - - - - ✅︎ -* - `Phi3VForCausalLM` - - Phi-3-Vision-based - - T + I - - `TIGER-Lab/VLM2Vec-Full` - - 🚧 - - ✅︎ -* - `Qwen2VLForConditionalGeneration` - - Qwen2-VL-based - - T + I - - `MrLight/dse-qwen2-2b-mrl-v1` - - - - ✅︎ -``` +- * Architecture + * Models + * Inputs + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `LlavaNextForConditionalGeneration` + * LLaVA-NeXT-based + * T / I + * `royokong/e5-v` + * + * ✅︎ +- * `Phi3VForCausalLM` + * Phi-3-Vision-based + * T + I + * `TIGER-Lab/VLM2Vec-Full` + * 🚧 + * ✅︎ +- * `Qwen2VLForConditionalGeneration` + * Qwen2-VL-based + * T + I + * `MrLight/dse-qwen2-2b-mrl-v1` + * + * ✅︎ +::: _________________ @@ -849,9 +852,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - ```{tip} + :::{tip} When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - ``` + ::: 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index daf6e2f250416..3f9ca27eb438e 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -14,9 +14,9 @@ In short, you should increase the number of GPUs and the number of nodes until y After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. 
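As a quick illustration of the estimate described above (a sketch only — `790` is just the example value from the log line, and `16` is vLLM's default block size):

```python
# Rough KV-cache capacity estimate from the "# GPU blocks: 790" log line.
gpu_blocks = 790      # number of GPU KV-cache blocks reported at startup
block_size = 16       # tokens stored per block (vLLM's default)
max_tokens = gpu_blocks * block_size
print(f"Roughly {max_tokens} tokens can be cached concurrently")  # 12640
```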
-```{note} +:::{note} There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. -``` +::: ## Running vLLM on a single node @@ -94,12 +94,12 @@ vllm serve /path/to/the/model/in/the/container \ To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. -```{warning} +:::{warning} After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. -``` +::: -```{warning} +:::{warning} Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -``` +::: diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md index cd3c6a430b7fa..827c25b50522f 100644 --- a/docs/source/serving/engine_args.md +++ b/docs/source/serving/engine_args.md @@ -4,6 +4,7 @@ Below, you can find an explanation of every engine argument for vLLM: + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils @@ -16,6 +17,7 @@ Below, you can find an explanation of every engine argument for vLLM: Below are the additional arguments related to the asynchronous engine: + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md index f9b08077a03b4..9845241930a40 100644 --- a/docs/source/serving/env_vars.md +++ b/docs/source/serving/env_vars.md @@ -2,14 +2,14 @@ vLLM uses the following environment variables to configure the system: -```{warning} +:::{warning} Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. 
It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -``` +::: -```{literalinclude} ../../../vllm/envs.py +:::{literalinclude} ../../../vllm/envs.py :end-before: end-env-vars-definition :language: python :start-after: begin-env-vars-definition -``` +::: diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md index 371c284981ce9..e2b4c0814605b 100644 --- a/docs/source/serving/integrations/index.md +++ b/docs/source/serving/integrations/index.md @@ -1,8 +1,8 @@ # External Integrations -```{toctree} +:::{toctree} :maxdepth: 1 langchain llamaindex -``` +::: diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 6c84f6d1350a6..6c0dc8880a90d 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -31,8 +31,8 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I The following metrics are exposed: -```{literalinclude} ../../../vllm/engine/metrics.py +:::{literalinclude} ../../../vllm/engine/metrics.py :end-before: end-metrics-definitions :language: python :start-after: begin-metrics-definitions -``` +::: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 0213b0a3388ea..217b531e83788 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -4,10 +4,10 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. -```{note} +:::{note} We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. -``` +::: ## Offline Inference @@ -203,13 +203,13 @@ for o in outputs: Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). -```{important} +:::{important} A chat template is **required** to use Chat Completions API. Although most models come with a chat template, for others you have to define one yourself. The chat template can be inferred based on the documentation on the model's HuggingFace repo. For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: -``` +::: ### Image @@ -273,24 +273,25 @@ print("Chat completion output:", chat_response.choices[0].message.content) Full example: -```{tip} +:::{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, and pass the file path as `url` in the API request. -``` +::: -```{tip} +:::{tip} There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. 
In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -``` +::: -````{note} +:::{note} By default, the timeout for fetching images through HTTP URL is `5` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_IMAGE_FETCH_TIMEOUT= +export VLLM_IMAGE_FETCH_TIMEOUT= ``` -```` + +::: ### Video @@ -345,14 +346,15 @@ print("Chat completion output from image url:", result) Full example: -````{note} +:::{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_VIDEO_FETCH_TIMEOUT= +export VLLM_VIDEO_FETCH_TIMEOUT= ``` -```` + +::: ### Audio @@ -448,24 +450,25 @@ print("Chat completion output from audio url:", result) Full example: -````{note} +:::{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_AUDIO_FETCH_TIMEOUT= +export VLLM_AUDIO_FETCH_TIMEOUT= ``` -```` + +::: ### Embedding vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings), where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models. -```{tip} +:::{tip} The schema of `messages` is exactly the same as in Chat Completions API. You can refer to the above tutorials for more details on how to pass each type of multi-modal data. -``` +::: Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. Refer to the examples below for illustration. @@ -477,13 +480,13 @@ vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja ``` -```{important} +:::{important} Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` to run this model in embedding mode instead of text generation mode. The custom chat template is completely different from the original one for this model, and can be found here: -``` +::: Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: @@ -518,16 +521,16 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja ``` -```{important} +:::{important} Like with VLM2Vec, we have to explicitly pass `--task embed`. Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled by a custom chat template: -``` +::: -```{important} +:::{important} Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -``` +::: Full example: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 8a18598665a70..ded57500c5d0d 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -22,9 +22,9 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. 
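For reference, a minimal end-to-end sketch of the offline `LLM` API discussed here (the model name is only a placeholder — substitute any generative checkpoint supported by vLLM):

```python
from vllm import LLM, SamplingParams

# Placeholder model; any supported generative model name works here.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```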
-```{seealso} +:::{seealso} [API Reference](/api/offline_inference/index) -``` +::: ## Configuration Options @@ -70,12 +70,12 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` -```{important} +:::{important} To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. -``` +::: #### Quantization diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 8bc234545befd..82ef54c16dafb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -161,11 +161,11 @@ print(completion._request_id) The `vllm serve` command is used to launch the OpenAI-compatible server. -```{argparse} +:::{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs :prog: vllm serve -``` +::: #### Configuration file @@ -188,10 +188,10 @@ To use the above config file: vllm serve SOME_MODEL --config config.yaml ``` -```{note} +:::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. -``` +::: ## API Reference @@ -208,19 +208,19 @@ Code example: The following [sampling parameters](#sampling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-completion-sampling-params :end-before: end-completion-sampling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-completion-extra-params :end-before: end-completion-extra-params -``` +::: (chat-api)= @@ -240,19 +240,19 @@ Code example: The following [sampling parameters](#sampling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-completion-sampling-params :end-before: end-chat-completion-sampling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-completion-extra-params :end-before: end-chat-completion-extra-params -``` +::: (embeddings-api)= @@ -264,9 +264,9 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) which will be treated as a single prompt to the model. -```{tip} +:::{tip} This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. -``` +::: Code example: @@ -274,27 +274,27 @@ Code example: The following [pooling parameters](#pooling-params) are supported. 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-embedding-pooling-params :end-before: end-embedding-pooling-params -``` +::: The following extra parameters are supported by default: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-embedding-extra-params :end-before: end-embedding-extra-params -``` +::: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-embedding-extra-params :end-before: end-chat-embedding-extra-params -``` +::: (tokenizer-api)= @@ -465,19 +465,19 @@ Response: The following [pooling parameters](#pooling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-score-pooling-params :end-before: end-score-pooling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-score-extra-params :end-before: end-score-extra-params -``` +::: (rerank-api)= @@ -552,16 +552,16 @@ Response: The following [pooling parameters](#pooling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-rerank-pooling-params :end-before: end-rerank-pooling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-rerank-extra-params :end-before: end-rerank-extra-params -``` +::: diff --git a/pyproject.toml b/pyproject.toml index 8f2e20d0f5800..9892967b82d79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -111,6 +111,7 @@ markers = [ ] [tool.pymarkdown] +plugins.md004.style = "sublist" # ul-style plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html From 46fb056749b7d9f5e4ea7a060207ed2eb3ad75e0 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 29 Jan 2025 04:11:16 +0000 Subject: [PATCH 28/69] [V1][Metrics] Add TTFT and TPOT histograms (#12530) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 6 ++++++ vllm/v1/engine/output_processor.py | 4 +++- vllm/v1/metrics/loggers.py | 25 ++++++++++++++++++++++++ vllm/v1/metrics/stats.py | 11 +++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 9a84c82b62fdf..901ba8e8e5ef3 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:request_generation_tokens_sum", "vllm:request_generation_tokens_bucket", "vllm:request_generation_tokens_count", + "vllm:time_to_first_token_seconds_sum", + "vllm:time_to_first_token_seconds_bucket", + 
"vllm:time_to_first_token_seconds_count", + "vllm:time_per_output_token_seconds_sum", + "vllm:time_per_output_token_seconds_bucket", + "vllm:time_per_output_token_seconds_count", ] diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 39217b8090140..234ef8194ca93 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -27,6 +27,7 @@ def __init__( prompt: Optional[str], prompt_token_ids: List[int], detokenizer: IncrementalDetokenizer, + arrival_time: float, queue: Optional[asyncio.Queue[RequestOutput]], ): self.request_id = request_id @@ -37,7 +38,7 @@ def __init__( self.is_prefilling = True self.queue = queue - self.stats = RequestStateStats() + self.stats = RequestStateStats(last_token_time=arrival_time) @classmethod def from_new_request( @@ -54,6 +55,7 @@ def from_new_request( tokenizer=tokenizer, request=request, ), + arrival_time=request.arrival_time, queue=queue, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 87d9d63652c05..9bb24d1948651 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig): buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames).labels(*labelvalues) + self.histogram_time_to_first_token = \ + prometheus_client.Histogram( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + ], + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_per_output_token = \ + prometheus_client.Histogram( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, + 0.75, 1.0, 2.5 + ], + labelnames=labelnames).labels(*labelvalues) + def log(self, scheduler_stats: SchedulerStats, iteration_stats: IterationStats): """Log to prometheus.""" @@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats, self.histogram_num_generation_tokens_request.observe( finished_request.num_generation_tokens) + for ttft in iteration_stats.time_to_first_tokens_iter: + self.histogram_time_to_first_token.observe(ttft) + for tpot in iteration_stats.time_per_output_tokens_iter: + self.histogram_time_per_output_token.observe(tpot) + @staticmethod def _unregister_vllm_metrics(): # Unregister any existing vLLM collectors (for CI/CD diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 55d85a7992cc5..f4c276f0b6902 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,3 +1,4 @@ +import time from dataclasses import dataclass from typing import TYPE_CHECKING, List @@ -22,6 +23,7 @@ class RequestStateStats: """Stats that need to be tracked across delta updates.""" num_generation_tokens: int = 0 + last_token_time: float = 0.0 @dataclass @@ -40,6 +42,8 @@ def __init__(self, log_stats: bool): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 self.finished_requests: List[FinishedRequestStats] = [] + self.time_to_first_tokens_iter: List[float] = [] + self.time_per_output_tokens_iter: List[float] = [] def update_from_output(self, output: "EngineCoreOutput", is_prefilling: bool, prompt_len: int, @@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput", return num_new_generation_tokens = len(output.new_token_ids) + now = time.time() + last_token_latency = now - 
request_state_stats.last_token_time self.num_generation_tokens += num_new_generation_tokens if is_prefilling: @@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput", assert (num_new_generation_tokens > 0) self.num_prompt_tokens += prompt_len + self.time_to_first_tokens_iter.append(last_token_latency) + else: + self.time_per_output_tokens_iter.append(last_token_latency) + request_state_stats.num_generation_tokens += num_new_generation_tokens + request_state_stats.last_token_time = now def update_from_finished_request(self, request_output: "RequestOutput", request_state_stats: RequestStateStats): From bd02164cf9eeed8436b26d62c37c1d792e97f9e8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 28 Jan 2025 23:49:03 -0500 Subject: [PATCH 29/69] Bugfix for whisper quantization due to fake k_proj bias (#12524) Signed-off-by: mgoin --- vllm/model_executor/models/whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b8512b735da94..15e35fa9cd2c9 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -743,7 +743,7 @@ def _create_fake_bias_for_k_proj( So that the bias for k_proj in qkv_proj can be initialized with zeros. """ for name, weight in weights: - if ".self_attn.k_proj.weight" in name: + if name.endswith(".self_attn.k_proj.weight"): bias = torch.zeros(weight.size(0)) bias_name = name.replace("weight", "bias") yield from [(name, weight), (bias_name, bias)] From 5f671cb4c3145194e94ffb393ee459432f7fa2b8 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Tue, 28 Jan 2025 23:56:56 -0500 Subject: [PATCH 30/69] [V1] Improve Error Message for Unsupported Config (#12535) Co-authored-by: Michael Goin --- vllm/platforms/cuda.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2587e3a11dde3..e4b436edf7588 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -120,13 +120,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": if scheduler_config.is_multi_step: if envs.VLLM_USE_V1: - raise NotImplementedError + raise NotImplementedError( + "Multi-step scheduling is not supported (and not " + "needed) on VLLM V1. Please launch without " + "--num-scheduler-steps.") else: parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: if envs.VLLM_USE_V1: - raise NotImplementedError + raise NotImplementedError( + "Speculative decoding is not yet supported on VLLM V1." 
+ ) else: parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" From ef001d98ef36166ebacb48eab2e32eb738407b53 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 29 Jan 2025 04:53:13 -0300 Subject: [PATCH 31/69] Fix the pydantic logging validator (#12420) Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/protocol.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2bc136cc48038..29d071ce50c8e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -6,7 +6,8 @@ from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union import torch -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, + ValidationInfo, field_validator, model_validator) from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -45,14 +46,14 @@ class OpenAIBaseModel(BaseModel): # Cache class field names field_names: ClassVar[Optional[Set[str]]] = None - @model_validator(mode="before") + @model_validator(mode="wrap") @classmethod - def __log_extra_fields__(cls, data): - + def __log_extra_fields__(cls, data, handler): + result = handler(data) + if not isinstance(data, dict): + return result field_names = cls.field_names if field_names is None: - if not isinstance(data, dict): - return data # Get all class field names and their potential aliases field_names = set() for field_name, field in cls.model_fields.items(): @@ -67,7 +68,7 @@ def __log_extra_fields__(cls, data): "The following fields were present in the request " "but ignored: %s", data.keys() - field_names) - return data + return result class ErrorResponse(OpenAIBaseModel): @@ -1287,6 +1288,20 @@ class BatchRequestInput(OpenAIBaseModel): # The parameters of the request. body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] + @field_validator('body', mode='plain') + @classmethod + def check_type_for_url(cls, value: Any, info: ValidationInfo): + # Use url to disambiguate models + url = info.data['url'] + if url == "/v1/chat/completions": + return ChatCompletionRequest.model_validate(value) + if url == "/v1/embeddings": + return TypeAdapter(EmbeddingRequest).validate_python(value) + if url == "/v1/score": + return ScoreRequest.model_validate(value) + return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest, + ScoreRequest]).validate_python(value) + class BatchResponseData(OpenAIBaseModel): # HTTP status code of the response. 
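For context on the `mode="wrap"` validator introduced in the patch above: unlike a `before` validator, a wrap validator receives a `handler` and must call it to run the regular validation, which lets the extra-field logging happen without altering the parsed result. Below is a stripped-down, self-contained sketch of the same pattern (an illustration only, not part of the patch; alias handling, which the real `OpenAIBaseModel` also performs, is omitted):

```python
from typing import Any

from pydantic import BaseModel, model_validator


class Example(BaseModel):
    known_field: int = 0

    @model_validator(mode="wrap")
    @classmethod
    def __log_extra_fields__(cls, data: Any, handler):
        result = handler(data)  # run the regular validation first
        if isinstance(data, dict):
            # Report any keys that are not declared model fields.
            extra = data.keys() - cls.model_fields.keys()
            if extra:
                print(f"Ignored fields: {sorted(extra)}")
        return result


Example.model_validate({"known_field": 1, "surprise": 2})
# prints: Ignored fields: ['surprise']
```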
From 036ca94c25fa07391016aa1b4f93a8ac5d74f296 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 29 Jan 2025 01:54:35 -0700 Subject: [PATCH 32/69] [Bugfix] handle alignment of arguments in convert_sparse_cross_attention_mask_to_dense (#12347) Signed-off-by: Travis Johnson Signed-off-by: Wallas Santos Co-authored-by: Wallas Santos --- .../vision_language/test_mllama.py | 208 ++++++++++++++++++ vllm/model_executor/models/mllama.py | 18 +- 2 files changed, 222 insertions(+), 4 deletions(-) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 636a3eedff31b..16c71228ede7a 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,11 +1,15 @@ from typing import List, Optional, Tuple, Type, overload import pytest +import torch from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) +from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID, + MllamaForConditionalGeneration) from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs @@ -33,6 +37,29 @@ "meta-llama/Llama-3.2-11B-Vision-Instruct", ] +# Indices for inputs +TEXT_ONLY = '0' +IMAGE_AT_BEG = '1' +IMAGE_AT_MIDDLE = '2' +TWO_IMAGES = '3' + +# Input tokenized +prompt_data = { + # Tell me a story + TEXT_ONLY: [41551, 757, 264, 3446], + # <|image|> What's the content of this image + IMAGE_AT_BEG: + [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], + # Hello <|image|>What' the content of this image + IMAGE_AT_MIDDLE: + [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217], + #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? 
# noqa: E501 + TWO_IMAGES: [ + MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30, + MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30 + ] +} + def vllm_to_hf_output(vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -365,3 +392,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, num_logprobs=num_logprobs, tensor_parallel_size=1, ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, + num_logprobs, attn_backend: _Backend) -> None: + + stop_sign = image_assets[0].pil_image + + with global_force_attn_backend_context_manager(attn_backend), vllm_runner( + model, + dtype=dtype, + max_model_len=4096, + max_num_seqs=2, + tensor_parallel_size=1, + enforce_eager=True, + limit_mm_per_prompt={"image": + _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: + + # Regression tests for https://github.com/vllm-project/vllm/issues/10648 + + # Number of image tags is greater than the number of images provided + prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501 + image = stop_sign + with pytest.raises(ValueError): + vllm_model.generate_greedy_logprobs([prompt], + max_tokens, + num_logprobs, + images=[image]) + + # Batch of a text-only and image request that requires cross-attention + prompts = [ + "What is the capital of spain?", + "Text before the image...<|image|>What is in the image?", # noqa: E501 + ] + images = [ + None, + [stop_sign], + ] + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs, + images=images) + + # Test the reverse order too for good measure + prompts = [ + "<|begin_of_text|>Text before the image...<|image|>What is in the image?", # noqa: E501 + "<|begin_of_text|>Hello!", + ] + images = [ + [stop_sign], + None, + ] + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs, + images=images) + + +@pytest.mark.core_model +@pytest.mark.parametrize( + "input_indices_and_output", + # inputs, (cross_attention_mask, kv_range_for_decode) + [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)), + ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), + ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + ((23, 24), [[0, 6], [6, 12]])), + ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), + ([TWO_IMAGES], ((18, 12), [[6, 12]])), + ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))]) +def test_get_cross_attention_mask(input_indices_and_output) -> None: + + input_indices, expected_output = input_indices_and_output + + sequences = [torch.tensor(prompt_data[i]) for i in input_indices] + num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices + if i != TEXT_ONLY] + input = torch.cat(sequences) + + seq_lens = [len(s) for s in sequences] + + attn_data = FlashAttentionMetadata( + seq_lens=seq_lens, + # Dummy values + enable_kv_scales_calculation=False, + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=0, + slot_mapping=0, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=0, + max_prefill_seq_len=0, + max_decode_seq_len=0, + context_lens_tensor=None, + block_tables=None, + use_cuda_graph=False, + ) + + dummy: 
dict[str, str] = {} + + cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ + .get_cross_attention_mask(dummy, + input, + attn_data, + num_tiles=num_tiles, + num_tokens_per_tile=3, + dtype=torch.bfloat16) + + expected_cross_attention_mask, expected_kv_range_for_decode = \ + expected_output + + assert kv_range_for_decode == expected_kv_range_for_decode + if expected_cross_attention_mask is not None: + assert cross_attention_mask is not None + assert cross_attention_mask.shape == expected_cross_attention_mask + else: + assert cross_attention_mask is None + + +@pytest.mark.core_model +@pytest.mark.parametrize( + "input_indices", + [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE], + [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]]) +def test_get_full_text_row_masked_out_mask(input_indices) -> None: + + sequences = [torch.tensor(prompt_data[i]) for i in input_indices] + + seq_lens = [len(s) for s in sequences] + + num_prefill_tokens = sum(seq_lens) + + # TEXT_ONLY is zero, so it will be masked out, + # other instances should not be. + encoder_seq_lens = [int(i) for i in input_indices] + + attn_data = FlashAttentionMetadata( + seq_lens=seq_lens, + encoder_seq_lens=encoder_seq_lens, + num_prefill_tokens=num_prefill_tokens, + # Dummy values + enable_kv_scales_calculation=False, + num_prefills=0, + num_decode_tokens=0, + slot_mapping=0, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=0, + max_prefill_seq_len=0, + max_decode_seq_len=0, + context_lens_tensor=None, + block_tables=None, + use_cuda_graph=False, + ) + + dummy: dict[str, str] = {} + + full_text_row_masked_out_mask = MllamaForConditionalGeneration\ + .get_full_text_row_masked_out_mask(dummy, + attn_data, + torch.get_default_device()) + + full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze() + full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist() + + idx = 0 + assert len(full_text_row_masked_out_mask) == num_prefill_tokens + for i, seq_len in enumerate(seq_lens): + must_be_masked = input_indices[i] != TEXT_ONLY + for _ in range(seq_len): + assert full_text_row_masked_out_mask[idx] == must_be_masked, \ + f"full_text_row_masked_out_mask[{idx}] must be " \ + f"'{must_be_masked}' " + idx += 1 diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e15ac84a6049b..34b8624647ce6 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1485,14 +1485,23 @@ def convert_sparse_cross_attention_mask_to_dense( total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) - # A list of ranges, range[i] = [start, end] means - # if the i-th sample has N tiles in total, the tiles[start, end] - # will be used for cross-attention decoding. + # A list of ranges, range[i] = [start, end] means that the i-th image will + # use tiles[start, end] for cross-attention decoding. tile_range_for_decode = [] seq_start = 0 tile_start = 0 - for masks, tiles, length in zip(sparse_mask, num_tiles, lengths): + + # sparse_mask has an [] entry for each sequence that does not have images, + # but num_tiles does not have these entries... 
+ num_tiles_idx = 0 + for masks, length in zip(sparse_mask, lengths): + if len(masks) == 0: + # Text only + continue + + tiles = num_tiles[num_tiles_idx] + num_tiles_idx += 1 ts, td = -1, 0 for mask, tile in zip(masks, tiles): if len(mask) != 2: @@ -1512,6 +1521,7 @@ def convert_sparse_cross_attention_mask_to_dense( assert td != 0 tile_range_for_decode.append((ts, ts + td)) seq_start += length + assert num_tiles_idx == len(num_tiles) return dense_mask, tile_range_for_decode From d93bf4da855a0c5e8d3c875def6b37c5e9d77763 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:24:59 +0800 Subject: [PATCH 33/69] [Model] Refactoring of MiniCPM-V and add MiniCPM-o-2.6 support for vLLM (#12069) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: hzh Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Signed-off-by: shaochangxu.scx Signed-off-by: DarkLight1337 Signed-off-by: NickLucche Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Roger Wang Signed-off-by: Rafael Vasquez Signed-off-by: Akshat Tripathi Signed-off-by: Oleg Mosalov Signed-off-by: Jee Jee Li Signed-off-by: rshaw@neuralmagic.com Signed-off-by: Yida Wu Signed-off-by: Chenguang Li <757486878@qq.com> Signed-off-by: youkaichao Signed-off-by: Alex-Brooks Signed-off-by: Chen Zhang Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Shanshan Shen <467638484@qq.com> Signed-off-by: elijah Signed-off-by: Yikun Signed-off-by: mgoin Signed-off-by: Woosuk Kwon Signed-off-by: Konrad Zawora Signed-off-by: tjtanaa Signed-off-by: wangxiyuan Signed-off-by: Rui Qiao Co-authored-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Co-authored-by: shaochangxu <85155497+shaochangxu@users.noreply.github.com> Co-authored-by: shaochangxu.scx Co-authored-by: Cyrus Leung Co-authored-by: Nicolò Lucchesi Co-authored-by: sixgod Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Rafael Vasquez Co-authored-by: Isotr0py Co-authored-by: Cyrus Leung Co-authored-by: Akshat Tripathi Co-authored-by: Oleg Mosalov Co-authored-by: Jee Jee Li Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Co-authored-by: Yangcheng Li Co-authored-by: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com> Co-authored-by: Concurrensee Co-authored-by: Chenguang Li <757486878@qq.com> Co-authored-by: youkaichao Co-authored-by: Alex Brooks Co-authored-by: Chen Zhang Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Shanshan Shen <467638484@qq.com> Co-authored-by: elijah <30852919+e1ijah1@users.noreply.github.com> Co-authored-by: Yikun Jiang Co-authored-by: Steve Luo <36296769+SunflowerAries@users.noreply.github.com> Co-authored-by: mgoin Co-authored-by: Woosuk Kwon Co-authored-by: Konrad Zawora Co-authored-by: TJian Co-authored-by: tjtanaa Co-authored-by: wangxiyuan Co-authored-by: maang-h <55082429+maang-h@users.noreply.github.com> Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com> Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Co-authored-by: Roger Wang --- docs/source/models/supported_models.md | 9 +- examples/offline_inference/audio_language.py | 32 +- examples/offline_inference/vision_language.py | 33 +- requirements-cpu.txt | 
1 + requirements-cuda.txt | 1 + requirements-test.in | 3 + requirements-test.txt | 37 +- .../vision_language/test_models.py | 14 + .../vision_language/vlm_utils/model_utils.py | 11 + .../multimodal/processing/test_common.py | 2 + tests/models/registry.py | 4 +- vllm/entrypoints/chat_utils.py | 6 +- vllm/model_executor/models/minicpmo.py | 811 +++++++++++++++++ vllm/model_executor/models/minicpmv.py | 843 ++++++++++++++---- vllm/model_executor/models/registry.py | 1 + 15 files changed, 1622 insertions(+), 186 deletions(-) create mode 100644 vllm/model_executor/models/minicpmo.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 94f4bd6cadabd..afaad8818bdcb 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -693,9 +693,16 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ +- * `MiniCPMO` + * MiniCPM-O + * T + IE+ + VE+ + AE+ + * `openbmb/MiniCPM-o-2_6`, etc. + * ✅︎ + * ✅︎ + * - * `MiniCPMV` * MiniCPM-V - * T + IE+ + * T + IE+ + VE+ * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. * ✅︎ * ✅︎ diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 6fd74782a9aae..5952ec13ec3cb 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -67,7 +67,37 @@ def run_qwen2_audio(question: str, audio_count: int): return llm, prompt, stop_token_ids -model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio} +def run_minicpmo(question: str, audio_count: int): + model_name = "openbmb/MiniCPM-o-2_6" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM(model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + audio_placeholder = "()" * audio_count + audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501 + messages = [{ + 'role': 'user', + 'content': f'{audio_placeholder}\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True, + chat_template=audio_chat_template) + return llm, prompt, stop_token_ids + + +model_example_map = { + "ultravox": run_ultravox, + "qwen2_audio": run_qwen2_audio, + "minicpmo": run_minicpmo +} def main(args): diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 415439e88ed59..38c2b13d3f2c7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -265,8 +265,9 @@ def run_mantis(question: str, modality: str): # MiniCPM-V -def run_minicpmv(question: str, modality: str): - assert modality == "image" +def run_minicpmv_base(question: str, modality: str, model_name): + assert modality in ["image", "video"] + # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa # 2.0 # The official repo doesn't work yet, so we need to use a fork for now @@ -277,7 +278,15 @@ def run_minicpmv(question: str, modality: str): # 
model_name = "openbmb/MiniCPM-Llama3-V-2_5" # 2.6 - model_name = "openbmb/MiniCPM-V-2_6" + # model_name = "openbmb/MiniCPM-V-2_6" + # o2.6 + + # modality supports + # 2.0: image + # 2.5: image + # 2.6: image, video + # o2.6: image, video, audio + # model_name = "openbmb/MiniCPM-o-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) llm = LLM( @@ -294,13 +303,18 @@ def run_minicpmv(question: str, modality: str): # 2.5 # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - # 2.6 + # 2.6 / o2.6 stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + modality_placeholder = { + "image": "(./)", + "video": "()", + } + messages = [{ 'role': 'user', - 'content': f'(./)\n{question}' + 'content': f'{modality_placeholder[modality]}\n{question}' }] prompt = tokenizer.apply_chat_template(messages, tokenize=False, @@ -308,6 +322,14 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +def run_minicpmo(question: str, modality: str): + return run_minicpmv_base(question, modality, "openbmb/MiniCPM-o-2_6") + + +def run_minicpmv(question: str, modality: str): + return run_minicpmv_base(question, modality, "openbmb/MiniCPM-V-2_6") + + # LLama 3.2 def run_mllama(question: str, modality: str): assert modality == "image" @@ -523,6 +545,7 @@ def run_qwen2_vl(question: str, modality: str): "llava-next-video": run_llava_next_video, "llava-onevision": run_llava_onevision, "mantis": run_mantis, + "minicpmo": run_minicpmo, "minicpmv": run_minicpmv, "mllama": run_mllama, "molmo": run_molmo, diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 056fbf5a7adec..ed0d2c9fae0b6 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -4,5 +4,6 @@ # Dependencies for CPUs torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" +torchaudio; platform_machine != "ppc64le" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch datasets # for benchmark scripts diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 8002fbd8ee5b9..78fa360f2dc96 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -5,6 +5,7 @@ ray[default] >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.1 +torchaudio==2.5.1 # These must be updated alongside torch torchvision == 0.20.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 diff --git a/requirements-test.in b/requirements-test.in index bc76a91ad5356..13ad17b256734 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -12,6 +12,8 @@ decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests +vector_quantize_pytorch # required for minicpmo_26 test +vocos # required for minicpmo_26 test peft pqdm ray[adag]==2.40.0 @@ -19,6 +21,7 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test torch==2.5.1 +torchaudio==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.0 # required for pixtral test diff --git a/requirements-test.txt b/requirements-test.txt index 09e009c2e21f4..df7e904bb0d34 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -106,9 +106,17 @@ dnspython==2.7.0 docutils==0.16 # via awscli einops==0.8.0 - # via -r requirements-test.in + # via + # -r requirements-test.in + # encodec + # vector-quantize-pytorch + # vocos +einx==0.3.0 + # via vector-quantize-pytorch email-validator==2.2.0 # via pydantic +encodec==0.1.1 + # via vocos evaluate==0.4.3 # via lm-eval fastparquet==2024.11.0 @@ -125,6 +133,8 @@ filelock==3.16.1 # triton fonttools==4.54.1 # via matplotlib +frozendict==2.4.6 + # via einx frozenlist==1.5.0 # via # aiohttp @@ -159,6 +169,7 @@ huggingface-hub==0.26.2 # timm # tokenizers # transformers + # vocos idna==3.10 # via # anyio @@ -261,6 +272,8 @@ numpy==1.26.4 # cupy-cuda12x # datasets # decord + # einx + # encodec # evaluate # fastparquet # genai-perf @@ -283,6 +296,7 @@ numpy==1.26.4 # torchvision # transformers # tritonclient + # vocos nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -455,6 +469,7 @@ pyyaml==6.0.2 # responses # timm # transformers + # vocos ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 @@ -517,6 +532,7 @@ scipy==1.13.1 # scikit-learn # sentence-transformers # statsmodels + # vocos sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -540,7 +556,9 @@ sqlitedict==2.1.0 statsmodels==0.14.4 # via genai-perf sympy==1.13.1 - # via torch + # via + # einx + # torch tabledata==1.3.3 # via pytablewriter tabulate==0.9.0 @@ -568,12 +586,21 @@ torch==2.5.1 # -r requirements-test.in # accelerate # bitsandbytes + # encodec # lm-eval # peft # sentence-transformers # tensorizer # timm + # torchaudio # torchvision + # vector-quantize-pytorch + # vocos +torchaudio==2.5.1 + # via + # -r requirements-test.in + # encodec + # vocos torchvision==0.20.1 # via timm tqdm==4.66.6 @@ -584,6 +611,7 @@ tqdm==4.66.6 # lm-eval # nltk # peft + # pqdm # sentence-transformers # tqdm-multiprocess # transformers @@ -615,6 +643,7 @@ typing-extensions==4.12.2 # huggingface-hub # librosa # mistral-common + # pqdm # pydantic # pydantic-core # torch @@ -626,6 +655,10 @@ urllib3==2.2.3 # requests # responses # tritonclient +vector-quantize-pytorch==1.21.2 + # via -r requirements-test.in +vocos==0.1.0 + # via -r requirements-test.in word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index d5f0d63288cc1..62c644f73d62d 100644 --- 
a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -350,6 +350,20 @@ postprocess_inputs=model_utils.wrap_inputs_post_processor, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "minicpmo_26": VLMTestInfo( + models=["openbmb/MiniCPM-o-2_6"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "(./)\n", + max_model_len=4096, + max_num_seqs=2, + get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 + postprocess_inputs=model_utils.ignore_inputs_post_processor( + "image_sizes" + ), + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, + patch_hf_runner=model_utils.minicpmo_patch_hf_runner + ), "minicpmv_26": VLMTestInfo( models=["openbmb/MiniCPM-V-2_6"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 1ca85c7bb2056..07bdb2cee44d2 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -497,6 +497,17 @@ def _generate(self, *args, **kwargs): return hf_model +def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + orig_generate = hf_model.model.generate + + def _generate(self, *args, **kwargs): + return orig_generate(*args, decode_text=False, **kwargs) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model + + def _generate_greedy_logprobs_limit( self, prompts: List[str], diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index b575ec6acbef3..ca28da268fa05 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -152,6 +152,8 @@ def _test_processing_correctness( "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "TIGER-Lab/Mantis-8B-siglip-llama3", "mistral-community/pixtral-12b", + "openbmb/MiniCPM-o-2_6", + "openbmb/MiniCPM-V-2_6", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", diff --git a/tests/models/registry.py b/tests/models/registry.py index 0bd06dea0ec7f..7952e65aa76a5 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -245,7 +245,9 @@ def check_available_online( "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3", # noqa: E501 hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 - "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", + trust_remote_code=True), + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", trust_remote_code=True), diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 723d6e9085806..97d2561df602a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -392,7 +392,7 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == 
"phi3_v": # Workaround since this token is not defined in the tokenizer return f"<|image_{current_count}|>" - if model_type == "minicpmv": + if model_type in ("minicpmo", "minicpmv"): return "(./)" if model_type in ("blip-2", "chatglm", "fuyu", "paligemma", "pixtral"): @@ -424,10 +424,14 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == "qwen2_audio": return (f"Audio {current_count}: " f"<|audio_bos|><|AUDIO|><|audio_eos|>") + if model_type == "minicpmo": + return "()" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": if model_type == "qwen2_vl": return "<|vision_start|><|video_pad|><|vision_end|>" + if model_type in ("minicpmo", "minicpmv"): + return "()" if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.video_token_index) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py new file mode 100644 index 0000000000000..eb4282d62005a --- /dev/null +++ b/vllm/model_executor/models/minicpmo.py @@ -0,0 +1,811 @@ +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" +from functools import partial +from itertools import accumulate +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) + +import torch +import torch.types +from torch import nn +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.whisper.modeling_whisper import ( + ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder) + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalFieldConfig +from vllm.multimodal.parse import (ModalityData, ModalityDataItems, + MultiModalDataItems, MultiModalDataParser, + VideoItem) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + PromptReplacement) +from vllm.multimodal.profiling import ProcessorInputs +from vllm.sequence import IntermediateTensors + +from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder, + MiniCPMVEmbeddingItems, MiniCPMVMultiModalDataParser, + MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo) +from .utils import AutoWeightsLoader, maybe_prefix + +CPU_DEVICE = torch.device("cpu") + +MiniCPMOEmbeddingItems = MiniCPMVEmbeddingItems + + +class MiniCPMOAudioFeatureInputs(TypedDict): + type: Literal["audio_features"] + data: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, num_channels, length)` + Slice here means chunk. Audio that is too long will be split into slices, + which is the same as image. + Padding is used therefore `data` is `torch.Tensor`. + """ + + audio_feature_lens: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices)` + + This should be feature length of each audio slice, + which equals to `data.shape[-1]` + """ + + audio_bounds: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, 2)` + + This should be in `(start, stop)` format. + """ + + +class MiniCPMOAudioEmbeddingInputs(TypedDict): + type: Literal["audio_embeds"] + data: List[torch.Tensor] + """ + Shape: `(batch_size * num_images * num_slices, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + instead of a batched tensor. + Length of each slice may vary, so pass it as a list. + """ + audio_bounds: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, 2)` + + This should be in `(start, stop)` format. 
+ """ + + +MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs, + MiniCPMOAudioEmbeddingInputs] + + +class MiniCPMOAudioEmbeddingItems(MiniCPMOEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "audio") + audio_embeds = self.data.get("audio_embeds", None) + if audio_embeds is None: + raise ValueError("Incorrect type of video_embeds", + "Got type: None") + self.data["audio_embeds"] = audio_embeds + + def get(self, index: int) -> object: + return self.data["audio_embeds"][index] + + +class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser): + + def _parse_audio_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return MiniCPMOAudioEmbeddingItems(data) + return super()._parse_audio_data(data) + + +class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): + audio_pattern = "()" + + def get_supported_mm_modalities(self) -> List[str]: + return ["image", "video", "audio"] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None, "audio": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "audio": self.get_max_audio_tokens(), + "video": self.get_max_video_tokens(seq_len) + } + + def get_default_audio_pool_step(self) -> int: + return 2 + + def get_default_audio_sampling_rate(self) -> int: + return 16000 + + def get_chunk_length(self) -> int: + return self.get_hf_config().audio_chunk_length + + def get_max_audio_tokens_per_chunk(self) -> int: + pool_step = self.get_default_audio_pool_step() + fbank_feat_in_chunk = 100 + cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1 + num_audio_tokens = (cnn_feat_in_chunk - pool_step) // pool_step + 1 + return num_audio_tokens + 2 # + + def get_max_audio_chunks_with_most_features(self) -> int: + return 30 + + def get_audio_len_by_num_chunks(self, num_chunks: int) -> int: + sampling_rate = self.get_default_audio_sampling_rate() + # exclude + num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk() - 2 + return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1 + + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + max_audios = mm_config.limit_per_prompt.get("audio", 1) + + # count tokens + # which are not in get_max_image_tokens + max_image_tokens = self.get_max_image_tokens( + ) * max_images + 4 * max_images + max_audio_tokens = self.get_max_audio_tokens( + ) * max_audios + 2 * max_audios + max_total_frames = self.get_max_video_frames(seq_len - + max_image_tokens - + max_audio_tokens) + + num_frames = max(max_total_frames // max(max_videos, 1), 1) + + return num_frames + + +class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder): + + def get_dummy_processor_inputs( + self, seq_len: int, mm_counts: Mapping[str, + int]) -> ProcessorInputs: + num_audios = mm_counts.get("audio", 0) + audio_len = self.info.get_max_audio_chunks_with_most_features() * \ + self.info.get_default_audio_sampling_rate() + + processor_inputs = super().get_dummy_processor_inputs( + seq_len, mm_counts) + mm_data = { + "image": + processor_inputs.mm_data["image"], + "video": + processor_inputs.mm_data["video"], + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + audio_prompt_texts = 
self.info.audio_pattern * num_audios + + return ProcessorInputs(prompt_text=processor_inputs.prompt_text + \ + audio_prompt_texts, + mm_data=mm_data) + + +class MiniCPMOMultiModalProcessor( + MiniCPMVMultiModalProcessor, + BaseMultiModalProcessor[MiniCPMOProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return MiniCPMOMultiModalDataParser( + target_sr=self.info.get_default_audio_sampling_rate()) + + def get_audio_prompt_texts(self, + audio_lens: int, + chunk_input: bool = True, + chunk_length: int = 1) -> str: + return self.info.get_hf_processor().get_audio_placeholder( + audio_lens, chunk_input, chunk_length) + + def get_special_tokens(self) -> Dict[str, torch.Tensor]: + tokenizer = self.info.get_tokenizer() + special_tokens = super().get_special_tokens() + if hasattr(tokenizer, "audio_start_id"): + special_tokens["audio_start_id"] = torch.tensor( + tokenizer.audio_start_id) + special_tokens["audio_end_id"] = torch.tensor( + tokenizer.audio_end_id) + return special_tokens + + def process_audios(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + audios = mm_data.pop("audios", []) + audio_embeds = mm_data.pop("audio_embeds", []) + if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0: + audio_outputs = { + "audio_lens": [], + "audio_features": [], + "audio_feature_lens": [], + "audio_num_segments": [] + } + for audio in audios: + single_audio_outputs = super().call_base_hf_processor( + prompt=self.info.audio_pattern, + mm_data={ + "audios": audio, + "chunk_input": True + }, + mm_kwargs=mm_kwargs) + audio_outputs["audio_lens"].append(len(audio)) + audio_outputs["audio_features"].append( + single_audio_outputs["audio_features"]) + audio_outputs["audio_num_segments"].append( + len(single_audio_outputs["audio_feature_lens"][0])) + audio_outputs["audio_feature_lens"] += \ + single_audio_outputs["audio_feature_lens"] + audio_outputs["audio_features"] = [ + audio_feature for single_audio_features in \ + audio_outputs["audio_features"] + for audio_feature in single_audio_features + ] + audio_outputs["audio_feature_lens"] = torch.cat( + audio_outputs["audio_feature_lens"]) + elif len(audio_embeds): + audio_outputs = { + "audio_lens": [ + self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + for single_audio_embeds in audio_embeds + ], + "audio_embeds": [ + chunk_embeds for single_audio_embeds in audio_embeds + for chunk_embeds in single_audio_embeds + ], + "audio_num_segments": [ + len(single_audio_embeds) + for single_audio_embeds in audio_embeds + ] + } + else: + audio_outputs = {} + return audio_outputs + + def get_placeholder_match_pattern(self) -> str: + return r"\(<(image|video|audio)>./\)" + + def get_placeholder_split_pattern(self) -> str: + return r"\(<(?:image|video|audio)>./\)" + + def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + return { + "image": self.process_images(mm_data, mm_kwargs), + "video": self.process_videos(mm_data, mm_kwargs), + "audio": self.process_audios(mm_data, mm_kwargs) + } + + def get_modality_num_counter(self, modality: str) -> str: + if modality == "audio": + return "audio_lens" + return super().get_modality_num_counter(modality) + + def get_num_slices_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> int: + if modality == "audio": + return inputs["audio"]["audio_num_segments"][index] + return super().get_num_slices_by_modality(inputs, modality, index) + + def 
get_prompt_texts_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> str: + if modality == "audio": + return self.get_audio_prompt_texts( + inputs["audio"]["audio_lens"][index]) + return super().get_prompt_texts_by_modality(inputs, modality, index) + + def _get_prompt_replacements( + self, mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + placeholder = { + "image": self.info.image_pattern, + "video": self.info.video_pattern, + "audio": self.info.audio_pattern + } + + def get_replacement_minicpmv(item_idx: int, modality: str): + if modality == "image": + return self.get_image_prompt_texts( + mm_items["image"].get_image_size(item_idx), item_idx) + elif modality == "video": + return self.get_video_prompt_texts( + mm_items["video"].get_frame_size(item_idx), + mm_items["video"].get_num_frames(item_idx)) + else: # audio + if isinstance(mm_items["audio"], MiniCPMOAudioEmbeddingItems): + single_audio_embeds = mm_items["audio"].get(item_idx) + audio_len = self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + return self.get_audio_prompt_texts(audio_len) + return self.get_audio_prompt_texts( + len(mm_items["audio"].get(item_idx))) + + return [ + PromptReplacement(modality=modality, + target=placeholder[modality], + replacement=partial(get_replacement_minicpmv, + modality=modality)) + for modality in ("image", "video", "audio") + ] + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + def get_slices(num_slices: List[int]) -> List[int]: + slice_indices = [0] + list(accumulate(num_slices)) + slices = [(slice_indices[i], slice_indices[i + 1]) + for i in range(len(num_slices))] + return [slice(*slice_item) for slice_item in slices] + + audio_slices = get_slices( + hf_inputs.get("audio_num_slices", torch.empty(0))) + return dict( + **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), + audio_features=MultiModalFieldConfig.flat("audio", audio_slices), + audio_feature_lens=MultiModalFieldConfig.flat( + "audio", audio_slices), + audio_num_slices=MultiModalFieldConfig.batched("audio"), + audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"), + audio_embeds=MultiModalFieldConfig.flat("audio", audio_slices)) + + +class MultiModalProjector(nn.Module): + + def __init__(self, in_dim: int, out_dim: int): + super().__init__() + self.linear1 = nn.Linear(in_features=in_dim, + out_features=out_dim, + bias=True) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(in_features=out_dim, + out_features=out_dim, + bias=True) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + hidden_states = self.relu(self.linear1(audio_features)) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +class MiniCPMWhisperEncoderLayer(nn.Module): + + def __init__(self, config: WhisperConfig, layer_idx: int = None): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = WHISPER_ATTENTION_CLASSES[ + config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + layer_idx=layer_idx, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = 
nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + residual = hidden_states + past_key_values = None + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, past_key_values = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_values, + ) + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, + p=self.activation_dropout, + training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, + min=-clamp_value, + max=clamp_value) + + outputs = (hidden_states, ) + + return outputs + + +class MiniCPMWhisperEncoder(WhisperEncoder): + + def __init__(self, config: WhisperConfig): + super().__init__(config) + self.layers = nn.ModuleList([ + MiniCPMWhisperEncoderLayer(config, layer_idx=i) + for i in range(config.encoder_layers) + ]) + + def forward( + self, + input_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> BaseModelOutputWithPast: + # Ignore copy + input_features = input_features.to(dtype=self.conv1.weight.dtype, + device=self.conv1.weight.device) + + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + + embed_pos = self.embed_positions.weight + + embed_pos = embed_pos[:inputs_embeds.shape[1], :] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + + encoder_states = () + + for idx, encoder_layer in enumerate(self.layers): + encoder_states = encoder_states + (hidden_states, ) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + # Ignore copy + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + ) + + hidden_states = layer_outputs[0] + + hidden_states = self.layer_norm(hidden_states) + encoder_states = encoder_states + (hidden_states, ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + ) + + +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMOMultiModalProcessor, + info=MiniCPMOProcessingInfo, + dummy_inputs=MiniCPMODummyInputsBuilder) +class MiniCPMO(MiniCPMV2_6): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.apm = 
self.init_audio_module(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "apm")) + + def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): + # Do not use parameters temporarily + audio_config = self.config.audio_config + model = MiniCPMWhisperEncoder(audio_config) + audio_output_dim = int(audio_config.encoder_ffn_dim // 4) + self.audio_avg_pooler = \ + nn.AvgPool1d(self.config.audio_pool_step, + stride=self.config.audio_pool_step) + self.audio_projection_layer = \ + MultiModalProjector(in_dim=audio_output_dim,out_dim=self.embed_dim) + self.audio_encoder_layer = -1 + return model + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) + return loader.load_weights(weights) + + def subsequent_chunk_mask( + self, + size: int, + chunk_size: int, + num_left_chunks: int = -1, + device: torch.device = CPU_DEVICE, + num_lookhead: int = 0, + ) -> torch.Tensor: + ret = torch.zeros(size, size, device=device, dtype=torch.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max((i // chunk_size - num_left_chunks) * chunk_size, + 0) + ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, + size) + ret[i, start:ending] = True + return ret + + def _get_feat_extract_output_lengths(self, + input_lengths: torch.LongTensor): + input_lengths_after_cnn = (input_lengths - 1) // 2 + 1 + input_lengths_after_pooling = ( + input_lengths_after_cnn - + self.config.audio_pool_step) // self.config.audio_pool_step + 1 + input_lengths_after_pooling = input_lengths_after_pooling.to( + dtype=torch.int32) + + return input_lengths_after_cnn, input_lengths_after_pooling + + # Copied from HF repo of MiniCPM-o-2_6, + # designed for batched inputs and outputs + def get_audio_hidden_states(self, data: MiniCPMOAudioInputs, + chunk_length: int) -> torch.Tensor: + wavforms = data.get( + "data", + []) # (bs, 80, frames) or [], multi audios need filled in advance + audio_feature_lens_raw = [data.get("audio_feature_lens", + [])] # list, [[x1, x2], [y1], [z1]] + + # exist audio + if len(wavforms) > 0: + audio_feature_lens = torch.hstack(audio_feature_lens_raw) + batch_size, _, max_mel_seq_len = wavforms.shape + max_seq_len = (max_mel_seq_len - 1) // 2 + 1 + + # Create a sequence tensor of shape (batch_size, max_seq_len) + seq_range = (torch.arange( + 0, + max_seq_len, + dtype=audio_feature_lens.dtype, + device=audio_feature_lens.device).unsqueeze(0).expand( + batch_size, max_seq_len)) + lengths_expand = audio_feature_lens.unsqueeze(1).expand( + batch_size, max_seq_len) + # Create mask + padding_mask = seq_range >= lengths_expand # 1 for padded values + + audio_attention_mask_ = padding_mask.view( + batch_size, 1, 1, max_seq_len).expand(batch_size, 1, + max_seq_len, max_seq_len) + audio_attention_mask = audio_attention_mask_.to( + dtype=self.apm.conv1.weight.dtype, + device=self.apm.conv1.weight.device) + + if chunk_length > 0: + chunk_num_frame = int(chunk_length * 50) + chunk_mask = self.subsequent_chunk_mask( + size=max_seq_len, + chunk_size=chunk_num_frame, + num_left_chunks=-1, + device=audio_attention_mask_.device, + ) + audio_attention_mask_ = torch.logical_or( + audio_attention_mask_, torch.logical_not(chunk_mask)) + + audio_attention_mask[audio_attention_mask_] = float("-inf") + audio_states = self.apm( + wavforms, attention_mask=audio_attention_mask).hidden_states[ + self.audio_encoder_layer] + audio_embeds = self.audio_projection_layer(audio_states) + + audio_embeds = 
audio_embeds.transpose(1, 2) + audio_embeds = self.audio_avg_pooler(audio_embeds) + audio_embeds = audio_embeds.transpose(1, 2) + + _, feature_lens_after_pooling = \ + self._get_feat_extract_output_lengths(audio_feature_lens) + + num_audio_tokens = feature_lens_after_pooling + + final_audio_embeds = [] + idx = 0 + for i in range(len(audio_feature_lens_raw)): + target_audio_embeds = [] + for _ in range(len(audio_feature_lens_raw[i])): + target_audio_embeds.append( + audio_embeds[idx, :num_audio_tokens[idx], :]) + idx += 1 + final_audio_embeds.append(target_audio_embeds) + return final_audio_embeds + else: + return [] + + def get_embedding_with_audios(self, vlm_embedding: torch.Tensor, + audio_inputs: Optional[MiniCPMOAudioInputs], + chunk_length: int) -> torch.Tensor: + device, dtype = vlm_embedding.device, vlm_embedding.dtype + if audio_inputs["type"] == "audio_embeds": + audio_embeddings = audio_inputs["data"] + audio_embeddings = [ + audio_embeddings[i].to(device=device, dtype=dtype) + for i in range(len(audio_embeddings)) + ] + else: + audio_embeddings = self.get_audio_hidden_states( + audio_inputs, chunk_length)[0] + if audio_embeddings is None or len(audio_embeddings) == 0: + return vlm_embedding + audio_bounds = audio_inputs["audio_bounds"] + if self.config.chunk_input: + audio_embs = torch.cat(audio_embeddings, dim=0).to(device=device, + dtype=dtype) + audio_start_pos = 0 + for bound in audio_bounds: + audio_len = bound[1] - bound[0] + vlm_embedding[bound[0]:bound[1]] = audio_embs[ + audio_start_pos:audio_start_pos + audio_len, :] + audio_start_pos += audio_len + else: + for embs, bound in zip(audio_embeddings, audio_bounds): + audio_indices = torch.arange(bound[0], + bound[1], + dtype=torch.long).to(device) + + if embs.shape[0] != len(audio_indices): + raise ValueError( + "Shape mismatch: Trying to assign embeddings " + f"of shape {embs.shape} " + f"to input indices of length {len(audio_indices)}") + vlm_embedding[audio_indices] = embs.to(dtype) + return vlm_embedding + + def _get_audio_bounds(self, input_ids: torch.Tensor, + audio_start_id: torch.Tensor, + audio_end_id: torch.Tensor) -> torch.Tensor: + audio_start_tokens, = torch.where(input_ids == audio_start_id[0]) + audio_start_tokens += 1 + audio_end_tokens, = torch.where(input_ids == audio_end_id[0]) + valid_audio_nums = max(len(audio_start_tokens), len(audio_end_tokens)) + return torch.hstack([ + audio_start_tokens[:valid_audio_nums].unsqueeze(-1), + audio_end_tokens[:valid_audio_nums].unsqueeze(-1) + ]) + + def _parse_and_validate_audio_inputs( + self, input_ids: torch.Tensor, + **kwargs: object) -> Tuple[MiniCPMOAudioInputs]: + audio_features = kwargs.pop("audio_features", []) + audio_feature_lens = kwargs.pop("audio_feature_lens", []) + audio_embeds = kwargs.pop("audio_embeds", None) + audio_start_id = kwargs.pop("audio_start_id", None) + audio_end_id = kwargs.pop("audio_end_id", None) + if audio_embeds is not None: + audio_embeds = [ + audio_embeds[i][j] for i in range(len(audio_embeds)) + for j in range(len(audio_embeds[i])) + ] + return MiniCPMOAudioEmbeddingInputs( + audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, + audio_end_id), + data=audio_embeds, + type="audio_embeds") + if len(audio_features) > 0: + audio_features_all = [ + i.permute(1, 0) for audio_feature in audio_features + for i in audio_feature + ] + audio_features = torch.nn.utils.rnn.pad_sequence( + audio_features_all, batch_first=True, + padding_value=0.0).permute(0, 2, 1) + audio_feature_lens = torch.cat( + [item for item in 
audio_feature_lens]) + + return MiniCPMOAudioFeatureInputs( + audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, + audio_end_id), + data=audio_features, + audio_feature_lens=audio_feature_lens, + type="audio_features") + return None + + def _parse_and_validate_inputs(self, input_ids: torch.Tensor, + **kwargs: object): + image_inputs = self._parse_and_validate_image_inputs( + input_ids, **kwargs) + if not any("audio" in key for key in kwargs): + return image_inputs, None + audio_inputs = self._parse_and_validate_audio_inputs( + input_ids, **kwargs) + return image_inputs, audio_inputs + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: Any, + ) -> torch.Tensor: + if intermediate_tensors is not None: + vlm_embeddings = None + else: + image_inputs, audio_inputs = \ + self._parse_and_validate_inputs(input_ids, **kwargs) + vlm_embeddings, _ = self.get_embedding_with_vision( + input_ids, image_inputs) + + if audio_inputs is not None: + vlm_embeddings = self.get_embedding_with_audios( + vlm_embeddings, audio_inputs, + self.config.audio_chunk_length) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None + + output = self.llm.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=vlm_embeddings, + ) + return output diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1aa529056893b..bf967d33a3176 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -22,21 +22,21 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re +from collections import Counter from functools import cached_property, partial -from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Set, Tuple, TypedDict, Union) +from itertools import accumulate +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.types from PIL import Image from torch import nn -from transformers import PretrainedConfig -from typing_extensions import NotRequired +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) @@ -48,33 +48,30 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, PlaceholderRange) +from vllm.multimodal.parse import (ImageItem, ImageSize, ModalityData, + ModalityDataItems, MultiModalDataItems, + MultiModalDataParser, 
VideoItem) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import AutoWeightsLoader, maybe_prefix -RawImageType = Union[Image.Image, torch.Tensor] - - -class MiniCPMVRawImageInput(TypedDict): - """Input mapper input with auxiliary data for computing image bounds.""" - image: RawImageType +CPU_DEVICE = torch.device("cpu") - # Image bounds token ids in 0-dim scaler tensor. - im_start_id: torch.Tensor - im_end_id: torch.Tensor - slice_start_id: NotRequired[torch.Tensor] - slice_end_id: NotRequired[torch.Tensor] +RawImageType = Union[Image.Image, torch.Tensor] class MiniCPMVImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: List[torch.Tensor] """ - Shape: `(batch_size * num_images, num_channels, height, width)` + Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` Note that the image size may vary, so we pass it as a list instead of a batched tensor. @@ -82,14 +79,14 @@ class MiniCPMVImagePixelInputs(TypedDict): image_bounds: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(start, stop)` format. """ tgt_sizes: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(height, width)` format. """ @@ -99,7 +96,8 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor """ - Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + Shape: `(batch_size * num_images * num_slices, + image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. instead of a batched tensor. @@ -107,7 +105,7 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): image_bounds: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(start, stop)` format. """ @@ -116,6 +114,93 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs] + +class MiniCPMVEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): + + def __init__(self, data: Dict, modality: str) -> None: + super().__init__(data, modality) + + def get_processor_data(self) -> Mapping[str, object]: + return self.data + + def get_passthrough_data(self) -> Mapping[str, object]: + return {} + + def get_count(self) -> int: + return len(self.data[f"{self.modality}_embeds"]) + + def get(self, index: int) -> Dict[str, torch.Tensor]: + out = {} + for k, v in self.data.items(): + out[k] = v[index] + return out + + +class MiniCPMVImageEmbeddingItems(MiniCPMVEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "image") + image_embeds = self.data.get("image_embeds", None) + image_sizes = self.data.get("image_sizes", None) + if image_embeds is None: + raise ValueError("In correct type of image_embeds", + "Got type: None") + if not isinstance(image_embeds[0], torch.Tensor): + raise ValueError("In correct type of image_embeds", + f"Got type: {type(image_embeds[0])}") + if image_sizes is None: + raise ValueError( + "In correct type of image_sizes", "Got type: None." 
+ "If you're using `image_size_list`, " + "please rename it to `image_sizes`") + if len(image_embeds[0].shape) == 2: + image_embeds = [image_embeds] + image_sizes = [image_sizes] + self.data["image_embeds"] = image_embeds + self.data["image_sizes"] = image_sizes + + def get_image_size(self, index: int) -> ImageSize: + image_size = self.data["image_sizes"][index] + return ImageSize(width=image_size[0], height=image_size[1]) + + +class MiniCPMVVideoEmbeddingItems(MiniCPMVEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "video") + video_embeds = self.data.get("video_embeds", None) + image_sizes = self.data.get("image_sizes", None) + num_frames = self.data.get("num_frames", None) + if video_embeds is None: + raise ValueError("In correct type of video_embeds", + "Got type: None") + if not isinstance(video_embeds[0], torch.Tensor): + raise ValueError("In correct type of video_embeds", + f"Got type: {type(video_embeds[0])}") + if image_sizes is None: + raise ValueError( + "In correct type of image_sizes", "Got type: None." + "If you're using `image_size_list`, " + "please rename it to `image_sizes`") + if num_frames is None: + raise ValueError("In correct type of numframes", "Got type: None") + if len(video_embeds[0].shape) == 2: + video_embeds = [video_embeds] + image_sizes = [image_sizes] + num_frames = [num_frames] + self.data["video_embeds"] = video_embeds + self.data["image_sizes"] = image_sizes + self.data["num_frames"] = num_frames + + def get_frame_size(self, index: int) -> ImageSize: + frame_size = self.data["image_sizes"][index] + return ImageSize(width=frame_size[0], height=frame_size[1]) + + def get_num_frames(self, index: int) -> int: + return self.data["num_frames"][index] + + DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) @@ -212,25 +297,6 @@ def forward(self, x: torch.Tensor, return x -def _build_image_input(ctx: InputContext, - image: RawImageType) -> MiniCPMVRawImageInput: - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code) - if hasattr(tokenizer, "slice_start_id"): - return MiniCPMVRawImageInput( - image=image, - im_start_id=torch.tensor(tokenizer.im_start_id), - im_end_id=torch.tensor(tokenizer.im_end_id), - slice_start_id=torch.tensor(tokenizer.slice_start_id), - slice_end_id=torch.tensor(tokenizer.slice_end_id)) - else: - return MiniCPMVRawImageInput( - image=image, - im_start_id=torch.tensor(tokenizer.im_start_id), - im_end_id=torch.tensor(tokenizer.im_end_id)) - - def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: version_float = getattr(config, "version", None) @@ -240,129 +306,512 @@ def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: if config.hidden_size == 2304 and config.query_num == 64: return (2, 0) return (2, 5) - version_str = str(version_float) return tuple(int(x) for x in version_str.split(".")) -def get_max_minicpmv_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config() - return getattr(hf_config, "query_num", 64) +class MiniCPMVMultiModalDataParser(MultiModalDataParser): + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return MiniCPMVImageEmbeddingItems(data) + return super()._parse_image_data(data) + + def _parse_video_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return 
MiniCPMVVideoEmbeddingItems(data) + return super()._parse_video_data(data) + + +class MiniCPMVProcessingInfo(BaseProcessingInfo): + image_pattern = "(./)" + video_pattern = "()" + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor( + self, + **kwargs: object, + ): + hf_processor = self.ctx.get_hf_processor() + return hf_processor + + def get_image_processor(self): + hf_processor = self.get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + return image_processor + + def get_model_version(self): + return get_version_by_config(self.get_hf_config()) + + def get_supported_mm_modalities(self) -> List[str]: + if self.get_model_version() == (2, 6): + return ["image", "video"] + else: + return ["image"] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + if self.get_model_version() == (2, 6): + return {"image": None, "video": None} + else: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + mm_max_tokens = {"image": self.get_max_image_tokens()} + if self.get_model_version() == (2, 6): + mm_max_tokens["video"] = self.get_max_video_tokens(seq_len) + return mm_max_tokens + + def get_max_video_frame_tokens(self) -> int: + frame_size = self.get_video_frame_size_with_most_features() + return self.get_num_image_tokens(frame_size, + self.get_video_max_slice_num()) + + def get_max_video_tokens(self, seq_len: int) -> int: + return self.get_max_video_frame_tokens( + ) * self.get_num_frames_with_most_features(seq_len) + + def get_max_audio_tokens(self) -> int: + return self.get_max_audio_tokens_per_chunk( + ) * self.get_max_audio_chunks_with_most_features() + + def get_slice_query_num(self) -> int: + hf_config = self.get_hf_config() + query_num = getattr(hf_config, "query_num", 64) + return query_num + + def get_max_slice_num(self) -> int: + hf_config = self.get_hf_config() + max_slice_num = getattr(hf_config, "max_slice_num", 9) + return max_slice_num + + def get_sliced_grid(self, image_size: ImageSize, + max_slice_num: int) -> Tuple[int, int]: + if self.get_model_version() == (2, 6): + slice_grid = self.get_image_processor().get_sliced_grid( + image_size, max_slice_num) + else: + slice_grid = self.get_image_processor().get_sliced_grid(image_size) + return slice_grid + + def get_num_image_tokens(self, image_size: ImageSize, + max_slice_num: int) -> int: + slice_grid = self.get_sliced_grid(image_size, max_slice_num) + num_tokens = self.get_slice_query_num( + ) + 2 # ( * query_num) + if slice_grid is not None: + if self.get_model_version() == (2, 6): + num_additional_tokens = 0 + else: + # ( * query_num) + num_additional_tokens = 2 + num_tokens += ((self.get_slice_query_num() + 2) \ + * slice_grid[0] * slice_grid[1]) \ + + slice_grid[1] - 1 + num_additional_tokens + return num_tokens + def get_image_slice_nums(self, image_size: torch.Tensor, + max_slice_nums: int) -> int: + grid = self.get_sliced_grid(image_size, max_slice_nums) + return 1 if grid is None else grid[0] * grid[1] + 1 -def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int): - return SequenceData.from_prompt_token_counts((0, seq_len)) + def get_max_image_tokens(self) -> int: + image_size = self.get_image_size_with_most_features() + return self.get_num_image_tokens(image_size, self.get_max_slice_num()) + def get_image_size_with_most_features(self) -> ImageSize: + # Result in the max possible feature size (h:w = 9:1) + return self.get_default_image_sizes(self.get_max_slice_num()) -def 
dummy_image_for_minicpmv(ctx: InputContext, hf_config: PretrainedConfig, - num_images: int): - width = height = hf_config.image_size - image = _build_image_input(ctx, - image=Image.new("RGB", (width, height), - color=0)) - return {"image": [image] if num_images == 1 else [image] * num_images} + def get_video_max_slice_num(self) -> int: + return 1 + def get_video_frame_size_with_most_features(self) -> ImageSize: + return self.get_default_image_sizes(self.get_video_max_slice_num()) -def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config() - num_images = mm_counts["image"] + def get_max_video_frames(self, max_tokens: int) -> int: + num_frame_tokens = self.get_max_video_frame_tokens() + num_frames = max_tokens // num_frame_tokens + return num_frames - seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images) - mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images) + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) - return DummyData(seq_data, mm_data) + # count tokens + # which are not in get_max_image_tokens + max_image_tokens = self.get_max_image_tokens( + ) * max_images + 4 * max_images + max_total_frames = self.get_max_video_frames(seq_len - + max_image_tokens) + num_frames = max(max_total_frames // max(max_videos, 1), 1) -def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - model_config = ctx.model_config - version = get_version_by_config(model_config.hf_config) - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_processor = cached_get_image_processor(model_config.tokenizer) + return num_frames - def get_placeholder(image_size: Tuple[int, int], num_image: int): + def get_default_image_sizes(self, num_slices: int) -> ImageSize: + image_size = getattr(self.get_hf_config(), "image_size", 448) + return ImageSize(width=image_size, height=image_size * num_slices) + + +class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[MiniCPMVProcessingInfo] + ): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + image_width, image_height = \ + self.info.get_image_size_with_most_features() + video_width, video_height = \ + self.info.get_video_frame_size_with_most_features() + num_video_frames = \ + self.info.get_num_frames_with_most_features(seq_len) + + mm_data = { + "image": + self._get_dummy_images(width=image_width, + height=image_height, + num_images=num_images), + "video": [ + self._get_dummy_images(width=video_width, + height=video_height, + num_images=num_video_frames) + ] * num_videos, + } + + image_prompt_texts = self.info.image_pattern * num_images + video_prompt_texts = self.info.video_pattern * num_videos + + return ProcessorInputs(prompt_text=image_prompt_texts + + video_prompt_texts, + mm_data=mm_data) + + +class MiniCPMVMultiModalProcessor( + BaseMultiModalProcessor[MiniCPMVProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return MiniCPMVMultiModalDataParser() + + def get_slice_image_placeholder(self, image_size: ImageSize, + **kwargs) -> 
str: + image_processor = self.info.get_image_processor() + version = self.info.get_model_version() if version == (2, 0) or version == (2, 5): return image_processor.get_slice_image_placeholder(image_size) return image_processor.get_slice_image_placeholder( - image_size, num_image) - - prompt = inputs.get("prompt") - token_ids = inputs.get("prompt_token_ids") - if prompt is None: - prompt = tokenizer.decode(token_ids) - - pattern = "(./)" - images = multi_modal_data["image"] - image_tags = re.findall(pattern, prompt) - if len(image_tags) == 0: - new_token_ids = token_ids - new_prompt = prompt - else: - if isinstance(images, dict): - image_size_list = images.get("image_size_list") - images = [images.get("image_embeds")] + image_size, **kwargs) + + def get_image_prompt_texts(self, + image_size: ImageSize, + image_idx: int = 0) -> str: + prompt_texts = self.get_slice_image_placeholder(image_size, + image_idx=image_idx) + return prompt_texts + + def get_video_prompt_texts(self, image_size: ImageSize, + num_frames: int) -> str: + prompt_texts = "".join( + self.get_slice_image_placeholder( + image_size=image_size, + image_idx=0, + max_slice_nums=self.info.get_video_max_slice_num(), + use_image_id=False) for image_idx in range(num_frames)) + return prompt_texts + + def get_special_tokens(self) -> Dict[str, torch.Tensor]: + tokenizer = self.info.get_tokenizer() + special_tokens = { + "im_start_id": torch.tensor(tokenizer.im_start_id), + "im_end_id": torch.tensor(tokenizer.im_end_id) + } + if hasattr(tokenizer, "slice_start_id"): + special_tokens["slice_start_id"] = torch.tensor( + tokenizer.slice_start_id) + special_tokens["slice_end_id"] = torch.tensor( + tokenizer.slice_end_id) + return special_tokens + + @staticmethod + def repack_processor_outputs(outputs: Any) -> BatchFeature: + valid_keys = ["pixel_values", "image_sizes", "tgt_sizes"] + outputs = {key: outputs[key][0] for key in valid_keys} + return outputs + + def process_images(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + images = mm_data.pop("images", []) + image_embeds = mm_data.pop("image_embeds", []) + if isinstance(images, Image.Image): + images = [images] + if isinstance(images, (list, torch.Tensor)) and len(images) > 0: + image_outputs = super()._call_hf_processor( + prompt=self.info.image_pattern * len(images), + mm_data={"images": images}, + mm_kwargs=mm_kwargs) + image_outputs = MiniCPMVMultiModalProcessor.\ + repack_processor_outputs(image_outputs) + elif len(image_embeds) > 0: + image_sizes = mm_data.pop("image_sizes", None) + image_outputs = { + "image_embeds": torch.cat(image_embeds), + "image_sizes": image_sizes + } else: - if isinstance(images, Image.Image): - images = [images] - image_size_list = [image.size for image in images] - - text_chunks = prompt.split(pattern) - new_prompt_chunks: List[str] = [] - for i in range(len(image_size_list)): - new_prompt_chunks += [ - text_chunks[i], - get_placeholder(image_size_list[i], i) - ] - new_prompt_chunks.append(text_chunks[-1]) - new_prompt = "".join(new_prompt_chunks) - new_token_ids = tokenizer.encode(new_prompt) - - multi_modal_data["image"] = [ - _build_image_input(ctx, image) for image in images - ] + image_outputs = {} + return image_outputs + + def process_videos(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + videos = mm_data.pop("videos", []) + video_embeds = mm_data.pop("video_embeds", []) + if len(videos) > 0 and isinstance(videos[0], Image.Image): + videos = [videos] + 
if isinstance(videos, list) and len(videos) > 0: + video_outputs = { + "video_pixel_values": [], + "video_image_sizes": [], + "video_tgt_sizes": [], + "num_frames": [] + } + for video in videos: + parsed_video = [] + for frame in video: + if isinstance(frame, np.ndarray): + parsed_video.append(Image.fromarray(frame)) + else: + parsed_video.append(frame) + video = parsed_video + single_video_outputs = super()._call_hf_processor( + prompt=self.info.image_pattern * len(video), + mm_data={"images": video}, + mm_kwargs={ + **mm_kwargs, "max_slice_nums": + self.info.get_video_max_slice_num() + }) + video_outputs["num_frames"].append(len(video)) + for key in single_video_outputs: + if "video_" + key in video_outputs: + if key == "image_sizes": + video_outputs["video_" + key].append( + single_video_outputs[key][0][0]) + else: + video_outputs["video_" + + key] += single_video_outputs[key][0] + elif len(video_embeds): + image_sizes = mm_data.pop("image_sizes", None) + num_frames = mm_data.pop("num_frames", None) + video_outputs = { + "video_embeds": torch.cat(video_embeds), + "video_image_sizes": image_sizes, + "num_frames": num_frames + } + else: + video_outputs = {} + return video_outputs - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - ) + def get_placeholder_match_pattern(self) -> str: + return r"\(<(image|video)>./\)" + def get_placeholder_split_pattern(self) -> str: + return r"\(<(?:image|video)>./\)" -def input_mapper_for_minicpmv(ctx: InputContext, data: object): - model_config = ctx.model_config + def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + return { + "image": self.process_images(mm_data, mm_kwargs), + "video": self.process_videos(mm_data, mm_kwargs) + } - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - if image_processor is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") + def get_input_modalities(self, mm_data) -> List[str]: + supported_mm_modalities = self.info.get_supported_mm_modalities() + input_modalities = [] + for modality in supported_mm_modalities: + if modality in mm_data and mm_data[modality] != {}: + input_modalities.append(modality) + return input_modalities + + def get_modality_num_counter(self, modality: str) -> str: + if modality == "image": + return "image_sizes" + elif modality == "video": + return "video_image_sizes" + + def get_num_slices_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> int: + if modality == "image": + return self.info.get_image_slice_nums( + inputs[modality]["image_sizes"][index], + self.info.get_max_slice_num()) + elif modality == "video": + return self.info.get_image_slice_nums( + inputs[modality]["video_image_sizes"][index], + self.info.get_video_max_slice_num() + ) * inputs[modality]["num_frames"][index] + else: + raise ValueError(f"UnExpected modality: {modality}") + + def check_mm_inputs(self, inputs: Dict[str, object], + matches: List[str]) -> None: + counts = Counter(matches) + for modality, count in counts.items(): + if modality not in inputs or not inputs[modality]: + raise ValueError(f"None input data of {modality}." 
+ "But prompt requires.") + counter_key = self.get_modality_num_counter(modality) + if len(inputs[modality][counter_key]) != count: + raise ValueError(f"The prompt requires {count} " + f"{modality} inputs while you pass " + f"{len(inputs[modality][counter_key])}") + + def get_prompt_texts_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> str: + if modality == "image": + return self.get_image_prompt_texts( + inputs["image"]["image_sizes"][index], index) + elif modality == "video": + return self.get_video_prompt_texts( + inputs["video"]["video_image_sizes"][index], + inputs["video"]["num_frames"][index]) + else: + raise ValueError(f"UnExpected modality: {modality}") - if not isinstance(data, list): - raise ValueError( - "Image input must be list of MiniCPMVImageInput, got (%s)", data) + def call_base_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + return super()._call_hf_processor(prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + # Do not support combination inputs of images and videos for now + # Try to handle interleaved multimodal data + tokenizer = self.info.get_tokenizer() + inputs = self.process_mm_inputs(mm_data, mm_kwargs) + mm_input_modalities = self.get_input_modalities(inputs) + num_mm_slices = {modality: [] for modality in mm_input_modalities} + for modality in mm_input_modalities: + num_counter_key = self.get_modality_num_counter(modality) + for index in range(len(inputs[modality][num_counter_key])): + num_mm_slices[modality].append( + self.get_num_slices_by_modality(inputs, modality, index)) + return { + "input_ids": np.array([tokenizer.encode(prompt)]), + **{ + key: value + for modality in inputs + for key, value in inputs[modality].items() + }, + **{ + f"{modality}_num_slices": num_mm_slices[modality] + for modality in mm_input_modalities + } + } - if len(data) > 0 and isinstance(data[0]['image'], torch.Tensor): - batch_data = { - "image_embeds": data[0]['image'], + def _get_prompt_replacements( + self, mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + placeholder = { + "image": self.info.image_pattern, + "video": self.info.video_pattern, } - else: - batch_data = image_processor \ - .preprocess([img["image"] for img in data], return_tensors="pt") \ - .data - if len(data) > 0: - batch_data["im_start_id"] = data[0]["im_start_id"] - batch_data["im_end_id"] = data[0]["im_end_id"] - if "slice_start_id" in data[0]: - batch_data["slice_start_id"] = data[0]["slice_start_id"] - batch_data["slice_end_id"] = data[0]["slice_end_id"] + def get_replacement_minicpmv(item_idx: int, modality: str): + if modality == "image": + return self.get_image_prompt_texts( + mm_items["image"].get_image_size(item_idx), item_idx) + else: # video + return self.get_video_prompt_texts( + mm_items["video"].get_frame_size(item_idx), + mm_items["video"].get_num_frames(item_idx)) + + return [ + PromptReplacement(modality=modality, + target=placeholder[modality], + replacement=partial(get_replacement_minicpmv, + modality=modality)) + for modality in ("image", "video") + ] - return MultiModalKwargs(batch_data) + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + def 
get_slices(num_slices: List[int]) -> List[int]: + slice_indices = [0] + list(accumulate(num_slices)) + slices = [(slice_indices[i], slice_indices[i + 1]) + for i in range(len(num_slices))] + return [slice(*slice_item) for slice_item in slices] + + image_slices = get_slices( + hf_inputs.get("image_num_slices", torch.empty(0))) + video_slices = get_slices( + hf_inputs.get("video_num_slices", torch.empty(0))) + + return dict( + pixel_values=MultiModalFieldConfig.flat("image", image_slices), + image_sizes=MultiModalFieldConfig.batched("image"), + tgt_sizes=MultiModalFieldConfig.flat("image", image_slices), + image_num_slices=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.flat("image", image_slices), + video_pixel_values=MultiModalFieldConfig.flat( + "video", video_slices), + video_image_sizes=MultiModalFieldConfig.batched("video"), + video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices), + video_embeds=MultiModalFieldConfig.flat("video", video_slices), + video_num_slices=MultiModalFieldConfig.batched("video")) + + def apply( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputs: + supported_mm_modalities = self.info.get_supported_mm_modalities() + if isinstance(prompt, list): + prompt = self.info.get_tokenizer().decode(prompt) + matches = re.findall(self.get_placeholder_match_pattern(), prompt) + mm_orders = { + f"{modality}_orders": + torch.tensor( + [index for index, m in enumerate(matches) if m == modality]) + for modality in supported_mm_modalities + } + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + # Exclude x from placeholders + if "image" in result["mm_placeholders"] and \ + self.info.get_model_version() == (2, 6): + result["mm_placeholders"]["image"] = [ + PlaceholderRange(offset=p["offset"] + 3 + idx // 10, + length=p["length"] - 3 - idx // 10) + for idx, p in enumerate(result["mm_placeholders"]["image"]) + ] + result["mm_kwargs"].update(**mm_orders) + result["mm_kwargs"].update(**self.get_special_tokens()) + return result class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): @@ -409,7 +858,7 @@ def sampler(self): return get_sampler() - def get_embedding( + def get_embedding_with_vision( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], @@ -471,25 +920,46 @@ def _get_image_bounds( image_end_tokens[:valid_image_nums].unsqueeze(-1), ]) - def _parse_and_validate_inputs( + def _parse_and_validate_image_inputs( self, input_ids: torch.Tensor, **kwargs: object, ) -> Optional[MiniCPMVImageInputs]: - pixel_values = kwargs.pop("pixel_values", []) - tgt_sizes = kwargs.pop("tgt_sizes", []) + mm_data = { + "image": { + key: kwargs.pop(key, []) + for key in ["pixel_values", "tgt_sizes", "image_num_slices"] + }, + "video": { + "pixel_values": kwargs.pop("video_pixel_values", []), + "tgt_sizes": kwargs.pop("video_tgt_sizes", []), + "video_num_slices": kwargs.pop("video_num_slices", []) + } + } im_start_id = kwargs.pop("im_start_id", None) im_end_id = kwargs.pop("im_end_id", None) slice_start_id = kwargs.pop("slice_start_id", None) slice_end_id = kwargs.pop("slice_end_id", None) + mm_orders = { + f"{modality}": kwargs.pop(f"{modality}_orders", None) + for modality in ["image", "video", "audio"] + } + batch_size = max(len(mm_data["image"]["pixel_values"]), + len(mm_data["video"]["pixel_values"])) image_embeds = kwargs.pop("image_embeds", None) - + video_embeds = kwargs.pop("video_embeds", None) + if image_embeds 
is not None and video_embeds is not None: + raise ValueError( + "Incorrect inputs for vision embeddings. " + "Image embeds and video embeds can not exist simultaneously.") + if video_embeds is not None: + image_embeds = video_embeds if image_embeds is not None: if not isinstance(image_embeds, (torch.Tensor, list)): raise ValueError(f"Incorrect type of image embeds. " f"Got type: {type(image_embeds)}") - if isinstance(image_embeds, list): - image_embeds = torch.concat(image_embeds) + image_embeds = torch.concat( + [image_embeds[i] for i in range(len(image_embeds))]) return MiniCPMVImageEmbeddingInputs( image_bounds=self._get_image_bounds(input_ids, im_start_id, @@ -498,29 +968,47 @@ def _parse_and_validate_inputs( data=image_embeds, type="image_embeds", ) - - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - if not isinstance(tgt_sizes, (torch.Tensor, list)): - raise ValueError("Incorrect type of target sizes. " - f"Got type: {type(tgt_sizes)}") - - if len(pixel_values) != len(tgt_sizes): - raise ValueError("Inconsistent batch lengths, found: " - f"{len(pixel_values)} vs. {len(tgt_sizes)}") + for modality, modality_mm_data in mm_data.items(): + if not isinstance(modality_mm_data["pixel_values"], + (torch.Tensor, list)): + raise ValueError( + "Incorrect type of pixel values. " + f"Got type: {type(modality_mm_data['pixel_values'])}") + + if not isinstance(modality_mm_data["tgt_sizes"], + (torch.Tensor, list)): + raise ValueError( + "Incorrect type of target sizes. " + f"Got type: {type(modality_mm_data['tgt_sizes'])}") + + if len(modality_mm_data["pixel_values"]) != len( + modality_mm_data["tgt_sizes"]): + raise ValueError( + "Inconsistent batch lengths, found: " + f"{len(modality_mm_data['pixel_values'])} vs. 
" + f"{len(modality_mm_data['tgt_sizes'])}") pixel_values_flat: List[torch.Tensor] = [] tgt_sizes_flat: List[torch.Tensor] = [] - for pixel_b, tgt_b in zip(pixel_values, tgt_sizes): - if len(pixel_b) != len(tgt_b): - raise ValueError("Inconsistent N lengths, found: " - f"{len(pixel_b)} vs {len(tgt_b)}") - - for pixel_n, tgt_n in zip(pixel_b, tgt_b): - pixel_values_flat += pixel_n - tgt_sizes_flat += tgt_n + for b in range(batch_size): + mm_counts = {"image": 0, "video": 0} if self.version == (2, 6) \ + else {"image": 0} + mm_slice_counts = {"image": 0, "video": 0} \ + if self.version == (2, 6) else {"image": 0} + mm_orders_b = [(index, modality) for modality in mm_counts + for index in mm_orders[modality][b]] + for _, modality in sorted(mm_orders_b, key=lambda x: x[0]): + pos = mm_counts[modality] + num_slices = mm_data[modality][f"{modality}_num_slices"][b][ + pos] + slice_start_idx = mm_slice_counts[modality] + slice_end_idx = slice_start_idx + num_slices + pixel_values_flat += mm_data[modality]["pixel_values"][b][ + slice_start_idx:slice_end_idx] + tgt_sizes_flat += mm_data[modality]["tgt_sizes"][b][ + slice_start_idx:slice_end_idx] + mm_counts[modality] += 1 + mm_slice_counts[modality] += num_slices # NOTE: Input IDs does not contain image tokens during memory profiling, # so we allow it to be empty @@ -544,6 +1032,10 @@ def _parse_and_validate_inputs( type="pixel_values", ) + def _parse_and_validate_inputs(self, input_ids: torch.Tensor, + **kwargs: object): + return self._parse_and_validate_image_inputs(input_ids, **kwargs) + def forward( self, input_ids: torch.Tensor, @@ -556,9 +1048,10 @@ def forward( if intermediate_tensors is not None: vlm_embeddings = None else: - image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs) - - vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs) + image_inputs = \ + self._parse_and_validate_inputs(input_ids, **kwargs) + vlm_embeddings, _ = self.get_embedding_with_vision( + input_ids, image_inputs) # always pass the input via `inputs_embeds` # to make sure the computation graph is consistent @@ -964,15 +1457,15 @@ def get_vision_hidden_states(self, _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, - (2, 6): MiniCPMV2_6 + (2, 6): MiniCPMV2_6, } -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_minicpmv) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) -@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) -class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMVMultiModalProcessor, + info=MiniCPMVProcessingInfo, + dummy_inputs=MiniCPMVDummyInputsBuilder) +class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): """ Different versions of MiniCPMV use different visual encoders and LLMs, which is not conducive to the current integration logic of LoRA and diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d71b19060bf4..de05bf2b772f5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -162,6 +162,7 @@ "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 + "MiniCPMO": 
("minicpmo", "MiniCPMO"), "MiniCPMV": ("minicpmv", "MiniCPMV"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"), From ff7424f491935a1b4737bcc1570de0d616fc22f3 Mon Sep 17 00:00:00 2001 From: Yanyi Liu Date: Wed, 29 Jan 2025 17:41:01 +0800 Subject: [PATCH 34/69] [Frontend] Support override generation config in args (#12409) Signed-off-by: liuyanyi --- tests/test_config.py | 70 ++++++++++++++++++++++++++++++++++++++++ vllm/config.py | 13 ++++++-- vllm/engine/arg_utils.py | 25 ++++++++++---- 3 files changed, 100 insertions(+), 8 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 4518adfc31bfc..ec366b93d6a37 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -281,3 +281,73 @@ def test_uses_mrope(model_id, uses_mrope): ) assert config.uses_mrope == uses_mrope + + +def test_generation_config_loading(): + model_id = "Qwen/Qwen2.5-1.5B-Instruct" + + # When set generation_config to None, the default generation config + # will not be loaded. + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config=None) + assert model_config.get_diff_sampling_param() == {} + + # When set generation_config to "auto", the default generation config + # should be loaded. + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config="auto") + + correct_generation_config = { + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_p": 0.8, + "top_k": 20, + } + + assert model_config.get_diff_sampling_param() == correct_generation_config + + # The generation config could be overridden by the user. + override_generation_config = {"temperature": 0.5, "top_k": 5} + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config="auto", + override_generation_config=override_generation_config) + + override_result = correct_generation_config.copy() + override_result.update(override_generation_config) + + assert model_config.get_diff_sampling_param() == override_result + + # When generation_config is set to None and override_generation_config + # is set, the override_generation_config should be used directly. + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config=None, + override_generation_config=override_generation_config) + + assert model_config.get_diff_sampling_param() == override_generation_config diff --git a/vllm/config.py b/vllm/config.py index d7c9311ae3cb0..58464eae80b82 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -165,6 +165,8 @@ class ModelConfig: `logits_processors` extra completion argument. Defaults to None, which allows no processors. generation_config: Configuration parameter file for generation. + override_generation_config: Override the generation config with the + given config. 
""" def compute_hash(self) -> str: @@ -225,6 +227,7 @@ def __init__( logits_processor_pattern: Optional[str] = None, generation_config: Optional[str] = None, enable_sleep_mode: bool = False, + override_generation_config: Optional[Dict[str, Any]] = None, ) -> None: self.model = model self.tokenizer = tokenizer @@ -368,6 +371,7 @@ def __init__( self.logits_processor_pattern = logits_processor_pattern self.generation_config = generation_config + self.override_generation_config = override_generation_config or {} self._verify_quantization() self._verify_cuda_graph() @@ -904,8 +908,13 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: """ if self.generation_config is None: # When generation_config is not set - return {} - config = self.try_get_generation_config() + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + available_params = [ "repetition_penalty", "temperature", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba96484e3fce9..1f203b6eaeb33 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -195,6 +195,7 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None + override_generation_config: Optional[Dict[str, Any]] = None enable_sleep_mode: bool = False calculate_kv_scales: Optional[bool] = None @@ -936,12 +937,23 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=nullable_str, default=None, help="The folder path to the generation config. " - "Defaults to None, will use the default generation config in vLLM. " - "If set to 'auto', the generation config will be automatically " - "loaded from model. If set to a folder path, the generation config " - "will be loaded from the specified folder path. If " - "`max_new_tokens` is specified, then it sets a server-wide limit " - "on the number of output tokens for all requests.") + "Defaults to None, no generation config is loaded, vLLM defaults " + "will be used. If set to 'auto', the generation config will be " + "loaded from model path. If set to a folder path, the generation " + "config will be loaded from the specified folder path. If " + "`max_new_tokens` is specified in generation config, then " + "it sets a server-wide limit on the number of output tokens " + "for all requests.") + + parser.add_argument( + "--override-generation-config", + type=json.loads, + default=None, + help="Overrides or sets generation config in JSON format. " + "e.g. ``{\"temperature\": 0.5}``. If used with " + "--generation-config=auto, the override parameters will be merged " + "with the default config from the model. If generation-config is " + "None, only the override parameters are used.") parser.add_argument("--enable-sleep-mode", action="store_true", @@ -1002,6 +1014,7 @@ def create_model_config(self) -> ModelConfig: override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, + override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, ) From b02fd288b28f0bfa2d7ac8958fe0d71ec22ffc1b Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Wed, 29 Jan 2025 01:46:12 -0800 Subject: [PATCH 35/69] [Hardware][NV] Fix Modelopt model loading for k-v-scales for Llama models. 
(#11787) Signed-off-by: Pavani Majety Co-authored-by: mgoin --- vllm/model_executor/model_loader/weight_utils.py | 11 ++++++++++- vllm/model_executor/models/llama.py | 9 +++++---- vllm/model_executor/models/mixtral.py | 6 +++++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b764a940b1742..e4d103f7cab99 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -652,9 +652,18 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: return remapped_name possible_scale_names = [".k_scale", ".v_scale"] + modelopt_scale_names = [ + ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" + ] for scale_name in possible_scale_names: if name.endswith(scale_name): - remapped_name = name.replace(scale_name, f".attn{scale_name}") + if any(mo_scale_name in name + for mo_scale_name in modelopt_scale_names): + remapped_name = name.replace( + f".self_attn.{scale_name[1]}_proj{scale_name}", + f".self_attn.attn{scale_name}") + else: + remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e214c30f5d60b..e7c264c04f1aa 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -404,6 +404,11 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + if "scale" in name: + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -423,10 +428,6 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue if is_pp_missing_parameter(name, self): continue diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index da415cdae96ed..fbb3704fa080f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -452,7 +452,11 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) From 27b78c73cad00f5c7bb3b2431f02dc680f7034bc Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Wed, 29 Jan 2025 22:07:09 +0800 Subject: [PATCH 36/69] [Kernel] add triton fused moe kernel for gptq/awq (#12185) --- tests/kernels/test_moe.py | 91 ++++ .../layers/fused_moe/fused_moe.py | 407 ++++++++++++++--- .../layers/quantization/__init__.py | 7 +- .../layers/quantization/moe_wna16.py | 424 ++++++++++++++++++ 4 files changed, 874 insertions(+), 55 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/moe_wna16.py diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 7fa5de1984452..7aa248ed1475c 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -18,6 +18,8 @@ fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + quantize_weights) from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -55,6 +57,95 @@ def test_fused_moe( rtol=0) +@pytest.mark.parametrize("m", [1, 32, 222]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("group_size", [64, 128]) +@pytest.mark.parametrize("has_zp", [True, False]) +@pytest.mark.parametrize("weight_bits", [4, 8]) +def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, + dtype: torch.dtype, group_size: int, has_zp: bool, + weight_bits: int): + print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits) + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + score = torch.randn((m, e), device="cuda", dtype=dtype) + + if weight_bits == 4: + pack_factor = 2 + quant_type = scalar_types.uint4 if has_zp else scalar_types.uint4b8 + elif weight_bits == 8: + pack_factor = 1 + quant_type = scalar_types.uint8 if has_zp else scalar_types.uint8b128 + + w1_ref = w1.clone() + w2_ref = w2.clone() + w1_qweight = torch.empty((e, 2 * n, k // pack_factor), + device="cuda", + dtype=torch.uint8) + w2_qweight = torch.empty((e, k, n // pack_factor), + device="cuda", + dtype=torch.uint8) + w1_scales = torch.empty((e, 2 * n, k // group_size), + device="cuda", + dtype=dtype) + w2_scales = torch.empty((e, k, n // group_size), + device="cuda", + dtype=dtype) + w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size), + device="cuda", + dtype=torch.uint8) + w2_qzeros = torch.empty((e, k // pack_factor, n // group_size), + device="cuda", + dtype=torch.uint8) + + for i in range(e * 2): + expert_id = i % e + if i // e == 0: + w, w_ref, w_qweight, w_scales, w_qzeros = \ + w1, w1_ref, w1_qweight, w1_scales, w1_qzeros + else: + w, w_ref, w_qweight, w_scales, w_qzeros = \ + w2, w2_ref, w2_qweight, w2_scales, w2_qzeros + weight, qweight, scales, qzeros = quantize_weights( + w[expert_id].T, quant_type, group_size, has_zp, False) + weight = weight.T + qweight = qweight.T.contiguous().to(torch.uint8) + scales = scales.T + if has_zp: + qzeros = 
qzeros.T.contiguous().to(torch.uint8) + if weight_bits == 4: + qweight = qweight[:, 1::2] * 16 + qweight[:, ::2] + if has_zp: + qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :] + + w_ref[expert_id] = weight + w_qweight[expert_id] = qweight + w_scales[expert_id] = scales + if has_zp: + w_qzeros[expert_id] = qzeros + + triton_output = fused_moe(a, + w1_qweight, + w2_qweight, + score, + topk, + renormalize=False, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + w1_scale=w1_scales, + w2_scale=w2_scales, + w1_zp=w1_qzeros if has_zp else None, + w2_zp=w2_qzeros if has_zp else None, + block_shape=[0, group_size]) + torch_output = torch_moe(a, w1_ref, w2_ref, score, topk) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 308c1d6ac6db1..dbb6c2ce4649e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -19,6 +19,206 @@ logger = init_logger(__name__) +@triton.jit +def fused_moe_kernel_gptq_awq( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + b_scale_ptr, + b_zp_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N: tl.constexpr, + K: tl.constexpr, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_bse, + stride_bsk, + stride_bsn, + stride_bze, + stride_bzk, + stride_bzn, + block_k_diviable: tl.constexpr, + group_size: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + has_zp: tl.constexpr, + use_int4_w4a16: tl.constexpr, + use_int8_w8a16: tl.constexpr): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. 
The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak) + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) + + if use_int4_w4a16: + b_ptrs = b_ptr + off_experts * stride_be + \ + (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn + b_shifter = (offs_k[:, None] % 2) * 4 + elif use_int8_w8a16: + b_ptrs = b_ptr + off_experts * stride_be + \ + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn + + if not has_zp and use_int4_w4a16: + b_zp_num = 8 + if not has_zp and use_int8_w8a16: + b_zp_num = 128 + elif has_zp and use_int4_w4a16: + b_zp_shifter = (offs_bn[None, :] % 2) * 4 + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. 
+ + if not block_k_diviable: + k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K + k_other = 0.0 + else: + k_mask = None + k_other = None + + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs) + if use_int4_w4a16: + b = (b >> b_shifter) & 0xF + + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + \ + offs_bn[None, :] * stride_bsn + \ + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk + b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other) + b_scale = b_scale.to(tl.float32) + + if has_zp and use_int4_w4a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \ + (offs_bn[None, :] // 2) * stride_bzn + \ + offs_k_true * stride_bzk + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = ((b_zp >> b_zp_shifter) & 0xF) + b_zp = b_zp.to(tl.float32) + elif has_zp and use_int8_w8a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \ + offs_bn[None, :] * stride_bzn + \ + offs_k_true * stride_bzk + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = b_zp.to(tl.float32) + + # We accumulate along the K dimension. + if has_zp: + b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type) + else: + b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type) + accumulator = tl.dot(a, b, acc=accumulator) + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + if use_int4_w4a16: + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + else: + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + @triton.jit def fused_moe_kernel( # Pointers to matrices @@ -266,6 +466,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], B_scale: Optional[torch.Tensor], + B_zp: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, @@ -277,6 +478,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, compute_type: tl.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_shape: Optional[List[int]] = None) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -292,50 +494,108 @@ def invoke_fused_moe_kernel(A: torch.Tensor, assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2] assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1] - elif use_int8_w8a16: + elif use_int8_w8a16 or use_int4_w4a16: assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 else: assert A_scale is None assert B_scale is None - grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ - 'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), ) + EM = sorted_token_ids.shape[0] + if A.shape[0] < config["BLOCK_SIZE_M"]: + # optimize for small batch_size. 
+ # We assume that top_ids of each token is unique, so + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks. + EM = min(sorted_token_ids.shape[0], + A.shape[0] * top_k * config['BLOCK_SIZE_M']) + grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( + B.shape[1], META['BLOCK_SIZE_N']), ) + + if (use_int8_w8a16 or use_int4_w4a16) and \ + block_shape is not None and block_shape[1] > 0: + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + + fused_moe_kernel_gptq_awq[grid]( + A, + B, + C, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.shape[1], + A.shape[1], + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + block_k_diviable=A.shape[1] % config["BLOCK_SIZE_K"] == 0, + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + **config, + ) - fused_moe_kernel[grid]( - A, - B, - C, - A_scale, - B_scale, - topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - B.shape[1], - B.shape[2], - sorted_token_ids.shape[0], - topk_ids.numel(), - A.stride(0), - A.stride(1), - B.stride(0), - B.stride(2), - B.stride(1), - C.stride(1), - C.stride(2), - A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, - A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, - B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, - B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, - B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, - 0 if block_shape is None else block_shape[0], - 0 if block_shape is None else block_shape[1], - MUL_ROUTED_WEIGHT=mul_routed_weight, - top_k=top_k, - compute_type=compute_type, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - **config, - ) + else: + fused_moe_kernel[grid]( + A, + B, + C, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.shape[1], + A.shape[1], + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) + if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) + if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) + if B_scale is not None and B_scale.ndim >= 2 else 0, + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + **config, + ) def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str: @@ -432,7 +692,7 @@ def try_get_optimal_moe_config( # NOTE: For block-wise quant, # BLOCK_K must be divisible by block_shape[1] # BLOCK_N and BLOCK_M has no requirements - if block_shape is not None: + if block_shape is not None and block_shape[0] != 0: 
config["BLOCK_SIZE_N"] = block_shape[0] config["BLOCK_SIZE_K"] = block_shape[1] return config @@ -531,12 +791,15 @@ def grouped_topk(hidden_states: torch.Tensor, def get_config_dtype_str(dtype: torch.dtype, + use_int4_w4a16: Optional[bool] = False, use_int8_w8a16: Optional[bool] = False, use_fp8_w8a8: Optional[bool] = False): if use_fp8_w8a8: return "fp8_w8a8" elif use_int8_w8a16: return "int8_w8a16" + elif use_int4_w4a16: + return "int4_w8a16" elif dtype == torch.float: # avoiding cases where kernel fails when float32 MoE # use fp16/bfloat16 configs @@ -551,14 +814,17 @@ def inplace_fused_experts(hidden_states: torch.Tensor, topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, - use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale, - a1_scale, a2_scale, block_shape) + use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, w1_scale, + w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, block_shape) def inplace_fused_experts_fake( @@ -569,8 +835,11 @@ def inplace_fused_experts_fake( topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> None: @@ -593,14 +862,18 @@ def outplace_fused_experts( topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> torch.Tensor: return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, - False, use_fp8_w8a8, use_int8_w8a16, w1_scale, - w2_scale, a1_scale, a2_scale, block_shape) + False, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16, w1_scale, w2_scale, w1_zp, w2_zp, + a1_scale, a2_scale, block_shape) def outplace_fused_experts_fake( @@ -611,8 +884,11 @@ def outplace_fused_experts_fake( topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> torch.Tensor: @@ -635,8 +911,11 @@ def fused_experts(hidden_states: torch.Tensor, inplace: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None): @@ -644,16 
+923,15 @@ def fused_experts(hidden_states: torch.Tensor, torch.ops.vllm.inplace_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, use_int8_w8a16, - w1_scale, w2_scale, a1_scale, + use_int4_w4a16, w1_scale, + w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, block_shape) return hidden_states else: - return torch.ops.vllm.outplace_fused_experts(hidden_states, w1, w2, - topk_weights, topk_ids, - use_fp8_w8a8, - use_int8_w8a16, w1_scale, - w2_scale, a1_scale, - a2_scale, block_shape) + return torch.ops.vllm.outplace_fused_experts( + hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, + use_int8_w8a16, use_int4_w4a16, w1_scale, w2_scale, w1_zp, w2_zp, + a1_scale, a2_scale, block_shape) def fused_experts_impl(hidden_states: torch.Tensor, @@ -664,13 +942,21 @@ def fused_experts_impl(hidden_states: torch.Tensor, inplace: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None): # Check constraints. - assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + if use_int4_w4a16: + assert hidden_states.shape[1] // 2 == w1.shape[ + 2], "Hidden size mismatch" + else: + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" @@ -687,6 +973,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, M = min(num_tokens, CHUNK_SIZE) config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, dtype=hidden_states.dtype) get_config_func = functools.partial( @@ -755,6 +1042,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, intermediate_cache1, a1_scale, w1_scale, + w1_zp, curr_topk_weights, curr_topk_ids, sorted_token_ids, @@ -766,6 +1054,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, compute_type=compute_type, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, block_shape=block_shape) torch.ops._C.silu_and_mul(intermediate_cache2, @@ -776,6 +1065,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, intermediate_cache3, a2_scale, w2_scale, + w2_zp, curr_topk_weights, curr_topk_ids, sorted_token_ids, @@ -787,6 +1077,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, compute_type=compute_type, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, block_shape=block_shape) ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), @@ -808,8 +1099,11 @@ def fused_moe( custom_routing_function: Optional[Callable] = None, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, @@ -834,8 +1128,12 @@ def fused_moe( note: Deepseekv2 model uses grouped_topk - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner products for w1 and w2. 
Defaults to False. - - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. + - use_int8_w8a16 (bool): If True, use matmul of int8 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. + - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. - w2_scale (Optional[torch.Tensor]): Optional scale to be used for @@ -873,8 +1171,11 @@ def fused_moe( inplace=inplace, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, w1_scale=w1_scale, w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_shape) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index d2bde13fcf546..bd0fd47993396 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -26,7 +26,8 @@ "experts_int8", "neuron_quant", "ipex", - "quark" + "quark", + "moe_wna16" ] # The customized quantization methods which will be added to this dict. @@ -94,6 +95,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: from .ipex_quant import IPEXConfig from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config + from .moe_wna16 import MoeWNA16Config from .neuron_quant import NeuronQuantConfig from .qqq import QQQConfig from .tpu_int8 import Int8TpuConfig @@ -121,7 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, - "quark": QuarkConfig + "quark": QuarkConfig, + "moe_wna16": MoeWNA16Config, } # Update the `method_to_config` with customized quantization methods. 
method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py new file mode 100644 index 0000000000000..8cd9c0a7ef253 --- /dev/null +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -0,0 +1,424 @@ +from typing import Any, Callable, Dict, List, Optional + +import torch + +from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization.awq import (AWQConfig, + AWQLinearMethod) +from vllm.model_executor.layers.quantization.awq_marlin import ( + AWQMarlinConfig, AWQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.gptq import (GPTQConfig, + GPTQLinearMethod) +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig, GPTQMarlinLinearMethod) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + + +class MoeWNA16Config(QuantizationConfig): + """Config class for MOE WNA16 (W8A16/W4A16) quantization.""" + + def __init__(self, linear_quant_method: str, weight_bits: int, + group_size: int, has_zp: bool, lm_head_quantized: bool, + modules_to_not_convert: Optional[List[str]], + full_config: Dict[str, Any]) -> None: + self.weight_bits = weight_bits + self.group_size = group_size + self.has_zp = has_zp + self.bit8_pack_factor = 8 // self.weight_bits + self.lm_head_quantized = lm_head_quantized + self.linear_quant_method = linear_quant_method + self.full_config = full_config + self.use_marlin = False + if self.linear_quant_method == "gptq": + self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible( + full_config) + elif self.linear_quant_method == "awq": + capability_tuple = current_platform.get_device_capability() + device_capability = (-1 if capability_tuple is None else + capability_tuple.to_int()) + awq_min_capability = AWQConfig.get_min_capability() + if device_capability < awq_min_capability: + raise ValueError( + "The quantization method moe_wna16 + awq is not supported " + "for the current GPU. " + f"Minimum capability: {awq_min_capability}. 
" + f"Current capability: {device_capability}.") + self.use_marlin = AWQMarlinConfig.is_awq_marlin_compatible( + full_config) + else: + raise ValueError("moe_wna16 only support gptq and awq.") + + if modules_to_not_convert is None: + self.modules_to_not_convert = [] + else: + self.modules_to_not_convert = modules_to_not_convert + + @classmethod + def get_name(cls) -> str: + return "moe_wna16" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config": + linear_quant_method = cls.get_from_keys(config, ["quant_method"]) + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], + default=False) + if linear_quant_method == "gptq": + has_zp = not cls.get_from_keys(config, ["sym"]) + modules_to_not_convert = [] + elif linear_quant_method == "awq": + has_zp = cls.get_from_keys(config, ["zero_point"]) + modules_to_not_convert = cls.get_from_keys( + config, ["modules_to_not_convert"]) + else: + raise ValueError("moe_wna16 only support gptq and awq.") + + return cls(linear_quant_method, weight_bits, group_size, has_zp, + lm_head_quantized, modules_to_not_convert, config) + + @classmethod + def override_quantization_method(cls, hf_quant_cfg, + user_quant) -> Optional[str]: + can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg) + if can_convert and user_quant == "moe_wna16": + return cls.get_name() + return None + + @classmethod + def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]): + # Extract data from quant config. + quant_method = quant_config.get("quant_method", "").lower() + num_bits = quant_config.get("bits") + desc_act = quant_config.get("desc_act") + + capability_tuple = current_platform.get_device_capability() + device_capability = (-1 if capability_tuple is None else + capability_tuple.to_int()) + awq_min_capability = AWQConfig.get_min_capability() + + gptq_compatible = quant_method == "gptq" and \ + not desc_act and num_bits in [4, 8] + awq_compatible = quant_method == "awq" and num_bits == 4 and \ + device_capability >= awq_min_capability + + return gptq_compatible or awq_compatible + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + if is_layer_skipped_quant(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() + elif isinstance(layer, FusedMoE): + return MoeWNA16Method(self) + else: + if self.linear_quant_method == "gptq": + if self.use_marlin: + return GPTQMarlinLinearMethod( + GPTQMarlinConfig.from_config(self.full_config)) + else: + return GPTQLinearMethod( + GPTQConfig.from_config(self.full_config)) + elif self.linear_quant_method == "awq": + if self.use_marlin: + return AWQMarlinLinearMethod( + AWQMarlinConfig.from_config(self.full_config)) + else: + return AWQLinearMethod( + AWQConfig.from_config(self.full_config)) + else: + raise ValueError("moe_wna16 only support gptq and awq.") + + +def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) + + +class MoeWNA16Method(FusedMoEMethodBase): + """Linear method for MOE WNA16 (W8A16/W4A16) quantization. 
+ + Args: + quant_config: The MOE WNA16 (W8A16/W4A16) quantization config. + """ + + def __init__(self, quant_config: MoeWNA16Config): + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + layer.quant_config = self.quant_config + bit8_pack_factor = self.quant_config.bit8_pack_factor + group_size = self.quant_config.group_size + group_size_div_factor = 1 + + # make intermediate_size and hidden_size diviable by group_size + # we reduce the group size to ensure that + # and we would repeat the loaded_weight later + while intermediate_size_per_partition % group_size or \ + hidden_size % group_size: + group_size = group_size // 2 + group_size_div_factor *= 2 + assert group_size >= 32 + layer.group_size = group_size + layer.group_size_div_factor = group_size_div_factor + + strategy = FusedMoeWeightScaleSupported.GROUP.value + extra_weight_attrs.update({ + "quant_method": strategy, + "is_transposed": False + }) + + assert 'weight_loader' in extra_weight_attrs + weight_loader = extra_weight_attrs['weight_loader'] + wrapped_weight_loader = MoeWNA16Method.get_weight_loader( + layer, weight_loader) + extra_weight_attrs['weight_loader'] = wrapped_weight_loader + + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // bit8_pack_factor, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // bit8_pack_factor, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + + w13_scales = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // group_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + + w2_scales = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition // group_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + + if self.quant_config.has_zp: + w13_qzeros = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition // bit8_pack_factor, + hidden_size // group_size, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + + w2_qzeros = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size // bit8_pack_factor, + intermediate_size_per_partition // group_size, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + + if self.quant_config.linear_quant_method == "gptq": + # some param are unused, but we need to init them in order to + # load weights + invalid_param_keys = ["w13_g_idx", "w2_g_idx"] + if not self.quant_config.has_zp: + invalid_param_keys += ["w13_qzeros", "w2_qzeros"] + for key in invalid_param_keys: + param = torch.nn.Parameter(torch.empty((0, ), + dtype=torch.int32), + 
requires_grad=False) + layer.register_parameter(key, param) + set_weight_attrs(param, extra_weight_attrs) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + weight_bits = self.quant_config.weight_bits + has_zp = self.quant_config.has_zp + + return fused_experts(x, + layer.w13_qweight, + layer.w2_qweight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + w1_zp=layer.w13_qzeros if has_zp else None, + w2_zp=layer.w2_qzeros if has_zp else None, + block_shape=[0, layer.group_size]) + + @staticmethod + def get_weight_loader(layer, weight_loader): + + def convert_awq_tensor(tensor, tensor_type): + # convert awq qweight/qzeros to a standard format (assume int4) + # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8) + # qzeros: (k // group_size, n // pack_factor_bit32) -> + # (n // pack_factor_bit8, k // group_size) + # pack_factor_bit32 = 32 // weight_bits + # pack_factor_bit8 = 8 // weight_bits + + # 0. suppose origin shape (a, b), dtype int32 + # 1. convert to uint8, shape (a, b) -> (a, 4 * b) + size0 = tensor.size(0) + tensor = tensor.view(torch.uint8) + + # 2. unpack to uint4 (only when weight_bits == 4) + # shape (a, 4 * b) -> (a, 4 * b, 2) + shifter = torch.tensor([0, 4], + dtype=torch.uint8, + device=tensor.device) + tensor = (tensor[:, :, None] >> shifter) & 0xF + + # 3. change order, see + # https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py + # shape -> (a, 4 * b * pack_factor_bit8) + reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7] + tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order] + tensor = tensor.view(size0, -1) + + # 4. transpose, shape -> (4 * b * pack_factor_bit8, a) + tensor = tensor.T.contiguous() + + # 5. 
repack (only when weight_bits == 4) + # qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8) + # qzeros shape -> (4 * b, a) + + if tensor_type == "qweight": + tensor = tensor[:, 1::2] * 16 + tensor[:, ::2] + elif tensor_type == "qzeros": + tensor = tensor[1::2, :] * 16 + tensor[::2, :] + return tensor + + def convert_gptq_int4_qzeros(tensor): + tensor = tensor.view(torch.uint8) + shifter = torch.tensor([0, 4], + dtype=torch.uint8, + device=tensor.device) + tensor = (tensor[:, :, None] >> shifter) & 0xF + tensor = tensor + 1 + tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16 + return tensor + + def moe_wna16_weight_loader(param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, shard_id: str, + expert_id: int): + if "g_idx" in weight_name: + return + if not layer.quant_config.has_zp and "qzeros" in weight_name: + return + + device = get_tp_group().device + tp_rank = get_tensor_model_parallel_rank() + loaded_weight = loaded_weight.to(device) + shard_size = layer.intermediate_size_per_partition + + # convert gptq and awq weight to a standard format + if layer.quant_config.linear_quant_method == "awq": + assert layer.quant_config.weight_bits == 4 + if "weight" in weight_name: + loaded_weight = convert_awq_tensor(loaded_weight, + "qweight") + elif "zeros" in weight_name: + loaded_weight = convert_awq_tensor(loaded_weight, "qzeros") + else: + loaded_weight = loaded_weight.T + elif layer.quant_config.linear_quant_method == "gptq": + assert layer.quant_config.weight_bits in [4, 8] + if "weight" in weight_name: + loaded_weight = loaded_weight.T.contiguous().view( + torch.uint8) + elif "zeros" in weight_name: + # add 1 to gptq qzeros to align with awq + loaded_weight = loaded_weight.view(torch.uint8) + if layer.quant_config.weight_bits == 4: + loaded_weight = convert_gptq_int4_qzeros( + loaded_weight).T + else: + loaded_weight = loaded_weight.T + 1 + else: + loaded_weight = loaded_weight.T + + # repeat the qzeros/scales to fit new group size + if layer.group_size_div_factor > 1 and \ + "qzeros" in weight_name or "scales" in weight_name: + loaded_weight = loaded_weight.repeat_interleave( + layer.group_size_div_factor, 1) + + if "w13_qzeros" in weight_name: + tensor = loaded_weight.view(layer.tp_size, -1, + loaded_weight.size(1))[tp_rank] + if shard_id == "w1": + param.data[expert_id, :shard_size // 2] = tensor + else: + param.data[expert_id, shard_size // 2:] = tensor + elif "w2_qzeros" in weight_name: + param.data[expert_id] = loaded_weight.view( + loaded_weight.size(0), layer.tp_size, -1)[:, tp_rank] + else: + weight_loader(param, loaded_weight, weight_name, shard_id, + expert_id) + + return moe_wna16_weight_loader From 73aa6cfdf789ddc67a3d2924ef52fd791554fe2a Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 29 Jan 2025 16:12:24 -0500 Subject: [PATCH 37/69] Revert "[Build/CI] Fix libcuda.so linkage" (#12552) --- CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dee9ec36895f..6c946fc5aa3ac 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,9 +446,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() message(STATUS "Enabling C extension.") -if(VLLM_GPU_LANG STREQUAL "CUDA") - list(APPEND VLLM_C_LIBS cuda) -endif() define_gpu_extension_target( _C DESTINATION vllm @@ -457,7 +454,6 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} - LIBRARIES ${VLLM_C_LIBS} USE_SABI 3 WITH_SOABI) 
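As a usage illustration for the "moe_wna16" quantization method introduced in #12185 above: the method is selected through vLLM's standard `quantization` argument, and `MoeWNA16Config.override_quantization_method` picks it up when the checkpoint's GPTQ/AWQ config passes `is_moe_wna16_compatible`. The sketch below is not part of the patch and makes assumptions: the model name is only illustrative, and any GPTQ- or AWQ-quantized MoE checkpoint meeting the compatibility check should behave the same way.

    from vllm import LLM, SamplingParams

    # Illustrative checkpoint name (assumption); substitute any compatible
    # GPTQ/AWQ MoE model. "moe_wna16" routes MoE layers through the new
    # fused Triton W4A16/W8A16 kernel, while linear layers keep their
    # original GPTQ/AWQ (or Marlin) method.
    llm = LLM(model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
              quantization="moe_wna16")
    out = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
    print(out[0].outputs[0].text)
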
From e0cc5f259a8bec0d66ed0bc3e25ca245377679a1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 29 Jan 2025 13:47:33 -0800 Subject: [PATCH 38/69] [V1][BugFix] Free encoder cache for aborted requests (#12545) Signed-off-by: Woosuk Kwon --- vllm/v1/core/encoder_cache_manager.py | 9 ++++++++- vllm/v1/core/scheduler.py | 14 ++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 0cd8c806a3e47..9d570b334c6cf 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -38,7 +38,8 @@ def allocate(self, request: Request, input_id: int) -> None: def get_cached_input_ids(self, request: Request) -> Set[int]: return self.cached.get(request.request_id, set()) - def free(self, request: Request, input_id: int) -> None: + def free_encoder_input(self, request: Request, input_id: int) -> None: + """Free a single encoder input id for the request.""" req_id = request.request_id if req_id not in self.cached: return @@ -49,6 +50,12 @@ def free(self, request: Request, input_id: int) -> None: self.num_free_slots += request.get_num_encoder_tokens(input_id) self.freed.append((req_id, input_id)) + def free(self, request: Request) -> None: + """Free all cached input ids for the request.""" + input_ids = self.get_cached_input_ids(request) + for input_id in input_ids: + self.free_encoder_input(request, input_id) + def get_freed_ids(self) -> List[Tuple[str, int]]: freed = self.freed self.freed = [] diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 7a88cc9433b32..da2e31b1fb75b 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -202,7 +202,7 @@ def schedule(self) -> "SchedulerOutput": # which have output tokens. num_new_tokens = request.num_tokens - num_computed_tokens if num_new_tokens == 0: - # The happens when prompt length is divisible by the block + # This happens when prompt length is divisible by the block # size and all blocks are cached. Now we force to recompute # the last block. Note that we have to re-compute an entire # block because allocate_slots() assumes num_computed_tokens @@ -269,6 +269,7 @@ def schedule(self) -> "SchedulerOutput": # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. + num_common_prefix_blocks = 0 if self.running: any_request = self.running[0] num_common_prefix_blocks = ( @@ -433,7 +434,8 @@ def update_from_output( if start_pos + num_tokens <= request.num_computed_tokens: # The encoder output is already processed and stored # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + self.encoder_cache_manager.free_encoder_input( + request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] @@ -445,8 +447,10 @@ def update_from_output( # TODO: Update the KV cache manager for prefix caching. # Check for stop and update request state. - # This must be called before me make the EngineCoreOutput. + # This must be called before we make the EngineCoreOutput. stopped = self._check_stop(request) + if stopped: + self._free_request(request) # Add EngineCoreOutput for this Request. 
output = EngineCoreOutput( @@ -472,7 +476,6 @@ def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len or request.num_output_tokens >= request.max_tokens): request.status = RequestStatus.FINISHED_LENGTH_CAPPED - self._free_request(request) return True sampling_params = request.sampling_params @@ -480,13 +483,11 @@ def _check_stop(self, request: Request) -> bool: if (not sampling_params.ignore_eos and last_token_id == request.eos_token_id): request.status = RequestStatus.FINISHED_STOPPED - self._free_request(request) return True if last_token_id in (sampling_params.stop_token_ids or ()): request.status = RequestStatus.FINISHED_STOPPED request.stop_reason = last_token_id - self._free_request(request) return True return False @@ -525,6 +526,7 @@ def finish_requests( def _free_request(self, request: Request) -> None: assert request.is_finished() self.kv_cache_manager.free(request) + self.encoder_cache_manager.free(request) self.running_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] self.finished_req_ids.add(request.request_id) From 1c1bb0bbf20955d346f66bb25d349c1bd9fe6ea2 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Wed, 29 Jan 2025 18:47:30 -0600 Subject: [PATCH 39/69] [Misc][MoE] add Deepseek-V3 moe tuning support (#12558) Signed-off-by: Divakar Verma --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 1fa0da75c79d2..5c8bf33afebc8 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -450,7 +450,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, def main(args: argparse.Namespace): print(args) - config = AutoConfig.from_pretrained(args.model) + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k @@ -461,6 +462,11 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "DeepseekV3ForCausalLM": + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Default: Mixtral. 
E = config.num_local_experts @@ -538,6 +544,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--tune", action="store_true") + parser.add_argument("--trust-remote-code", action="store_true") args = parser.parse_args() main(args) From f17f1d46086692a2973fad94860a95799fbd8582 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 30 Jan 2025 02:31:01 +0000 Subject: [PATCH 40/69] [V1][Metrics] Add GPU cache usage % gauge (#12561) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 1 + vllm/v1/core/kv_cache_manager.py | 5 +++++ vllm/v1/core/scheduler.py | 1 + vllm/v1/metrics/loggers.py | 11 ++++++++++- vllm/v1/metrics/stats.py | 2 +- 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 901ba8e8e5ef3..941f465711ef1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -200,6 +200,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", + "vllm:gpu_cache_usage_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_prompt_tokens_sum", diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 18fdfdfe4a010..d6c612f155f01 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -69,6 +69,11 @@ def __init__( # is finished. self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} + @property + def usage(self) -> float: + return 1.0 - (self.free_block_queue.num_free_blocks / + self.num_gpu_blocks) + def get_computed_blocks( self, request: Request) -> Tuple[List[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index da2e31b1fb75b..910fc4ff4d2b6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -544,6 +544,7 @@ def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), + gpu_cache_usage=self.kv_cache_manager.usage, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 9bb24d1948651..f901822c7887c 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -69,11 +69,13 @@ def log(self, scheduler_stats: SchedulerStats, logger.info( "Avg prompt throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, " - "Running: %d reqs, Waiting: %d reqs ", + "Running: %d reqs, Waiting: %d reqs " + "GPU KV cache usage: %.1f%%.", prompt_throughput, generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, + scheduler_stats.gpu_cache_usage * 100, ) @@ -97,6 +99,11 @@ def __init__(self, model_config: ModelConfig): documentation="Number of requests waiting to be processed.", labelnames=labelnames).labels(*labelvalues) + self.gauge_gpu_cache_usage = prometheus_client.Gauge( + name="vllm:gpu_cache_usage_perc", + documentation="GPU KV-cache usage. 
1 means 100 percent usage.", + labelnames=labelnames).labels(*labelvalues) + self.counter_prompt_tokens = prometheus_client.Counter( name="vllm:prompt_tokens_total", documentation="Number of prefill tokens processed.", @@ -147,6 +154,8 @@ def log(self, scheduler_stats: SchedulerStats, self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) self.counter_generation_tokens.inc( iteration_stats.num_generation_tokens) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index f4c276f0b6902..5277505128a63 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -14,7 +14,7 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 - # gpu_cache_usage: float = 0.0 + gpu_cache_usage: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0 From a2769032ca78108e58abc45e2eb0ade8b47a6515 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 30 Jan 2025 08:05:42 +0000 Subject: [PATCH 41/69] Set `?device={device}` when changing tab in installation guides (#12560) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/_static/custom.js | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 18b502c786e1d..be0b2a388e404 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -1,3 +1,4 @@ +// Add RunLLM widget document.addEventListener("DOMContentLoaded", function () { var script = document.createElement("script"); script.type = "module"; @@ -15,4 +16,23 @@ document.addEventListener("DOMContentLoaded", function () { script.async = true; document.head.appendChild(script); - }); \ No newline at end of file + }); + +// Update URL search params when tab is clicked + document.addEventListener("DOMContentLoaded", function () { + const tabs = document.querySelectorAll(".sd-tab-label"); + + function updateURL(tab) { + const syncGroup = tab.getAttribute("data-sync-group"); + const syncId = tab.getAttribute("data-sync-id"); + if (syncGroup && syncId) { + const url = new URL(window.location); + url.searchParams.set(syncGroup, syncId); + window.history.replaceState(null, "", url); + } + } + + tabs.forEach(tab => { + tab.addEventListener("click", () => updateURL(tab)); + }); +}); From 41bf5612f590dd13fa5e5dec083849ab6cde2f70 Mon Sep 17 00:00:00 2001 From: Beim <805908499@qq.com> Date: Fri, 31 Jan 2025 04:39:22 +1300 Subject: [PATCH 42/69] [Misc] fix typo: add missing space in lora adapter error message (#12564) Signed-off-by: Beim --- vllm/entrypoints/openai/serving_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index fc422f0917bd5..22e74b387cd73 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -203,7 +203,7 @@ async def _check_load_lora_adapter_request( for lora_request in self.lora_requests): return create_error_response( message= - f"The lora adapter '{request.lora_name}' has already been" + f"The lora adapter '{request.lora_name}' has already been " "loaded.", err_type="InvalidUserInput", status_code=HTTPStatus.BAD_REQUEST) From 9b0c4bab36c8f355f562d58521650ee8d5b6095d Mon Sep 17 00:00:00 2001 
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:53:22 -0500 Subject: [PATCH 43/69] [Kernel] Triton Configs for Fp8 Block Quantization (#11589) Signed-off-by: rshaw@neuralmagic.com Signed-off-by: mgoin Co-authored-by: mgoin Co-authored-by: simon-mo --- setup.py | 6 +- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ .../layers/fused_moe/fused_moe.py | 91 ++++++++--- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ .../layers/quantization/utils/fp8_utils.py | 77 +++++++-- 43 files changed, 5972 insertions(+), 42 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json diff --git a/setup.py b/setup.py index 59ece870b5585..50a2392a4d83b 100755 --- a/setup.py +++ b/setup.py @@ -608,7 +608,11 @@ def _read_requirements(filename: str) -> List[str]: ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { - "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] + "vllm": [ + "py.typed", + "model_executor/layers/fused_moe/configs/*.json", + "model_executor/layers/quantization/utils/configs/*.json", + ] } if _no_device(): diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..2e692a1583a4a --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..6fcf408755f5d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index dbb6c2ce4649e..39607dc4ca11e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -598,15 +598,27 @@ def invoke_fused_moe_kernel(A: torch.Tensor, ) -def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str: +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 +def get_config_file_name(E: int, + N: int, + dtype: Optional[str], + block_shape: Optional[List[int]] = None) -> str: device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" - return f"E={E},N={N},device_name={device_name}{dtype_selector}.json" + block_shape_selector = ("" if not block_shape or not all(block_shape) else + f",block_shape={block_shape}") + return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 @functools.lru_cache -def get_moe_configs(E: int, N: int, - dtype: Optional[str]) -> Optional[Dict[int, Any]]: +def get_moe_configs( + E: int, + N: int, + dtype: Optional[str], + block_n: Optional[int] = None, + block_k: Optional[int] = None, +) -> Optional[Dict[int, Any]]: """ Return optimized configurations 
for the fused MoE kernel. @@ -618,7 +630,8 @@ def get_moe_configs(E: int, N: int, # First look up if an optimized configuration is available in the configs # directory - json_file_name = get_config_file_name(E, N, dtype) + block_shape = [block_n, block_k] if block_n and block_k else None + json_file_name = get_config_file_name(E, N, dtype, block_shape) config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) @@ -645,21 +658,53 @@ def get_default_config( topk: int, dtype: Optional[str], is_marlin: bool, + block_shape: Optional[List[int]] = None, ) -> Dict[str, int]: - config = { - 'BLOCK_SIZE_M': 64, - 'BLOCK_SIZE_N': 64, - 'BLOCK_SIZE_K': 32, - 'GROUP_SIZE_M': 8 - } - # A heuristic: fused marlin works faster with this config for small M - if M <= E or (is_marlin and M <= 32): + if dtype == "fp8_w8a8": + if block_shape is None: + config = { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4, + } + if M <= E: + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4, + } + else: + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] + # BLOCK_SIZE_K must be divisible by block_shape[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_shape[0], + "BLOCK_SIZE_K": block_shape[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3, + } + else: config = { - 'BLOCK_SIZE_M': 16, - 'BLOCK_SIZE_N': 32, - 'BLOCK_SIZE_K': 64, - 'GROUP_SIZE_M': 1 + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, } + # A heuristic: fused marlin works faster with this config for small M + if M <= E or (is_marlin and M <= 32): + config = { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + } return config @@ -679,7 +724,9 @@ def try_get_optimal_moe_config( else: # First try to load optimal config from the file E, _, N = w2_shape - configs = get_moe_configs(E, N, dtype) + block_n = block_shape[0] if block_shape else 0 + block_k = block_shape[1] if block_shape else 0 + configs = get_moe_configs(E, N, dtype, block_n, block_k) if configs: # If an optimal configuration map has been found, look up the @@ -688,13 +735,7 @@ def try_get_optimal_moe_config( else: # Else use the default config config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, - is_marlin) - # NOTE: For block-wise quant, - # BLOCK_K must be divisible by block_shape[1] - # BLOCK_N and BLOCK_M has no requirements - if block_shape is not None and block_shape[0] != 0: - config["BLOCK_SIZE_N"] = block_shape[0] - config["BLOCK_SIZE_K"] = block_shape[1] + is_marlin, block_shape) return config diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..6496a38fba8ae --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..3618053b65831 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..46a982f5ee9a4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..035ec027fa566 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..8b49f2781cb54 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..851bc9f9f0b50 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..d1227c2157990 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..1c61451fb34e5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..63e661c80de6a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..cf354037903c0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..eccb86a76df0d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..88af48431d8b8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..dd069726d7ed4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..56b939e52fac3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..63d9a0bf5d79d --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..7fa398c15a2a5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..f15d8f64c7090 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..cd3e07804fdec --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..9d5a329d7466a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, 
+ "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..96e1594a3eabb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..5ffd367df833d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..eabc423949a24 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..51e237b91b8e7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..6280219c9ee7d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..40c01c0b92b4b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 
+ }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..c6fd3659799bc --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, 
+ "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..160f12ed3f95a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..e5c4a1d2c94e5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..2bf5eb27e3820 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..0a1e14cffbb2a --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..15b1c93f60fc5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..8ff12e64c172f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..4532f93681e2b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..ca7f32b9552b4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..5acea242cc0ad --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..58cdd93e90b8c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..b72e0371d1421 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..293adce387e06 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + 
"2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 43b1997019107..a7a3fa6601639 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1,12 +1,18 @@ # Adapted from https://github.com/sgl-project/sglang/pull/2575 -from typing import List, Optional, Tuple +import functools +import json +import os +from typing import Any, Dict, List, Optional, Tuple import torch import triton import triton.language as tl +from vllm.logger import init_logger from vllm.platforms import current_platform +logger = init_logger(__name__) + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -277,6 +283,43 @@ def _w8a8_block_fp8_matmul( tl.store(c_ptrs, c, mask=c_mask) +@functools.lru_cache +def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, + block_k: int) -> Optional[Dict[int, Any]]: + """ + Return optimized configurations for the w8a8 block fp8 kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the w8a8 block fp8 kernel. 
To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs + # directory + device_name = current_platform.get_device_name().replace(" ", "_") + json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info( + "Using configuration from %s for W8A8 Block FP8 kernel.", + config_file_path, + ) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default + # configuration + logger.warning( + "Using default W8A8 Block FP8 kernel config. Performance might " + "be sub-optimal! Config file not found at %s", + config_file_path, + ) + return None + + def w8a8_block_fp8_matmul( A: torch.Tensor, B: torch.Tensor, @@ -316,17 +359,22 @@ def w8a8_block_fp8_matmul( C_shape = A.shape[:-1] + (N, ) C = A.new_empty(C_shape, dtype=output_dtype) - # TODO: - # BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N can be optimized. - # BLOCK_SIZE_K must be divisible by block_k - # BLOCK_SIZE_N and BLOCK_SIZE_M has no requirements - BLOCK_SIZE_M = 128 - if M < BLOCK_SIZE_M: - BLOCK_SIZE_M = triton.next_power_of_2(M) - BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16) - BLOCK_SIZE_K = block_k - assert block_k % BLOCK_SIZE_K == 0 - BLOCK_SIZE_N = block_n + configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1]) + if configs: + # Get the optimal config if there is one + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_size[0] + # BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + } def grid(META): return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * @@ -353,10 +401,7 @@ def grid(META): As.stride(-1), Bs.stride(1), Bs.stride(0), - BLOCK_SIZE_M=BLOCK_SIZE_M, - BLOCK_SIZE_N=BLOCK_SIZE_N, - BLOCK_SIZE_K=BLOCK_SIZE_K, - GROUP_SIZE_M=8, + **config, ) return C From bd2107e30a258a5bcaa94e678a3890ec083a60a0 Mon Sep 17 00:00:00 2001 From: Nishidha Date: Fri, 31 Jan 2025 02:59:39 +0530 Subject: [PATCH 44/69] [CPU][PPC] Updated torch, torchvision, torchaudio dependencies (#12555) Signed-off-by: npanpaliya --- Dockerfile.ppc64le | 5 ++--- requirements-cpu.txt | 12 +++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index d3cd1c7b313bc..c4c1f3e357972 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,12 +4,12 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev +RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # 
Currently these may not be available for venv or pip directly -RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes COPY ./ /workspace/vllm @@ -21,7 +21,6 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/pip \ RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/requirements-cpu.txt b/requirements-cpu.txt index ed0d2c9fae0b6..ecfa822e01186 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -3,7 +3,13 @@ # Dependencies for CPUs torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" -torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" -torchaudio; platform_machine != "ppc64le" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" + +# required for the image processor of minicpm-o-2_6, this must be updated alongside torch +torchaudio; platform_machine != "ppc64le" +torchaudio==2.5.1; platform_machine == "ppc64le" + +# required for the image processor of phi3v, this must be updated alongside torch +torchvision; platform_machine != "ppc64le" +torchvision==0.20.1; platform_machine == "ppc64le" datasets # for benchmark scripts From 4078052f09f42f898b542e18d60d15a43db67a8b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 30 Jan 2025 18:07:19 -0500 Subject: [PATCH 45/69] [V1][Log] Add max request concurrency log to V1 (#12569) Signed-off-by: mgoin --- vllm/v1/core/kv_cache_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bab99fe37caee..dbdda51aedaa0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -393,6 +393,10 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, num_blocks = num_gpu_blocks_override logger.info("# GPU blocks: %d", num_blocks) + max_concurrency = (num_blocks * vllm_config.cache_config.block_size / + vllm_config.model_config.max_model_len) + logger.info("Maximum concurrency for %s tokens per request: %.2fx", + vllm_config.model_config.max_model_len, max_concurrency) per_layer_size = page_size * num_blocks From 9798b2fb0052092a6420172e41c0c8a307eedfa6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 30 Jan 2025 21:33:00 -0500 Subject: [PATCH 46/69] [Kernel] Update `cutlass_scaled_mm` to support 2d group (blockwise) scaling (#11868) --- CMakeLists.txt | 9 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 290 ++++--- csrc/core/math.hpp | 9 +- csrc/cutlass_extensions/common.hpp | 17 + .../gemm/collective/collective_builder.hpp | 123 +++ .../gemm/collective/fp8_accumulation.hpp | 183 +++++ ..._warpspecialized_fp8_blockwise_scaling.hpp | 730 ++++++++++++++++++ .../gemm/dispatch_policy.hpp | 39 + .../vllm_collective_builder.cuh | 2 +- .../cutlass_w8a8/c3x/cutlass_gemm_caller.cuh | 93 +++ 
.../{scaled_mm_c3x.cuh => c3x/scaled_mm.cuh} | 74 -- .../c3x/scaled_mm_azp_sm90_int8.cu | 24 + .../c3x/scaled_mm_blockwise_sm90_fp8.cu | 24 + .../scaled_mm_blockwise_sm90_fp8_dispatch.cuh | 168 ++++ .../cutlass_w8a8/c3x/scaled_mm_kernels.hpp | 33 + .../cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu | 24 + .../scaled_mm_sm90_fp8_dispatch.cuh} | 26 +- .../cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu | 24 + .../scaled_mm_sm90_int8_dispatch.cuh} | 25 +- .../cutlass_w8a8/scaled_mm_c3x.cu | 104 ++- .../cutlass_w8a8/scaled_mm_entry.cu | 3 - .../quantization/machete/machete_mainloop.cuh | 4 + tests/kernels/test_cutlass.py | 188 +++-- tests/kernels/utils.py | 32 +- vllm/_custom_ops.py | 22 + 25 files changed, 1924 insertions(+), 346 deletions(-) create mode 100644 csrc/cutlass_extensions/gemm/collective/collective_builder.hpp create mode 100644 csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp create mode 100644 csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp create mode 100644 csrc/cutlass_extensions/gemm/dispatch_policy.hpp create mode 100644 csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x.cuh => c3x/scaled_mm.cuh} (51%) create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x_sm90_fp8_dispatch.cuh => c3x/scaled_mm_sm90_fp8_dispatch.cuh} (76%) create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x_sm90_int8_dispatch.cuh => c3x/scaled_mm_sm90_int8_dispatch.cuh} (84%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c946fc5aa3ac..c823c9ff895c3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,7 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG v3.6.0 + GIT_TAG v3.7.0 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -299,7 +299,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index d0353bc8cb42a..b87496ca3b2b4 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -3,7 +3,7 @@ import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Tuple +from typing import Callable, Iterable, List, Optional, Tuple import torch import torch.utils.benchmark as TBenchmark @@ -12,6 +12,8 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -38,8 +40,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ).blocked_autorange(min_run_time=min_run_time) -def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_int8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + """Benchmark INT8-based kernels.""" assert dtype == torch.int8 a, b = make_rand_tensors(torch.int8, m, n, k) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) @@ -48,155 +57,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), + "cutlass_i8_i8_bf16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), + "cutlass_i8_i8_bf16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, + bias), + "cutlass_i8_i8_bf16_scaled_mm_azp": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj), + "cutlass_i8_i8_bf16_scaled_mm_azp_bias": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj, None, bias), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj, azp), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. 
+ bfloat16, azp_adj, azp, bias), + } + timers = [] - # pytorch impl - bfloat16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16))) - - # pytorch impl - float16 - timers.append( - bench_fn(label, sub_label, - "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, - a.to(dtype=torch.float16), b.to(dtype=torch.float16))) - - # cutlass impl - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - - # cutlass with bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) - - # cutlass with azp per-tensor - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj)) - - # cutlass with azp per-tensor + bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, None, bias)) - - # cutlass with azp per-token - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, azp)) - - # cutlass with azp per-token + bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, azp, bias)) + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. + if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) return timers -def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_fp8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + a_cont = a.contiguous() scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + block_scale_a = torch.rand((m, k // 128), + device="cuda", + dtype=torch.float32) + block_scale_b = torch.rand((k // 128, n // 128), + device="cuda", + dtype=torch.float32) + block_scale_a_M_major = block_scale_a.t().contiguous().t() + block_scale_b_K_major = block_scale_b.t().contiguous().t() bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) - timers = [] + print(m, k, n) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), + "pytorch_fp8_fp8_fp16_scaled_mm": + lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16), + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": + lambda: torch._scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.float16, + use_fast_accum=True), + "pytorch_fp8_fp8_bf16_scaled_mm": + lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16), + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": + lambda: 
torch._scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True), + "cutlass_fp8_fp8_bf16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), + "cutlass_fp8_fp8_fp16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16), + "cutlass_fp8_fp8_bf16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, + bias), + "cutlass_fp8_fp8_fp16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16)), + "triton_fp8_fp8_fp16_scaled_mm_blockwise": + lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a, + block_scale_b.t(), (128, 128)), + "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": + lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major, + block_scale_b_K_major, torch.float16), + } - # pytorch impl w. bf16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"))) - - # pytorch impl: bf16 output, without fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16)) - - # pytorch impl: bf16 output, with fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16, - use_fast_accum=True)) - - # pytorch impl: fp16 output, without fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16)) - - # pytorch impl: fp16 output, with fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16, - use_fast_accum=True)) - - # cutlass impl: bf16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - # cutlass impl: fp16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16)) - - # cutlass impl: bf16 output, with bias - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) - - # cutlass impl: fp16 output, with bias - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16, - bias.to(dtype=torch.float16))) + timers = [] + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. 
+ if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) return timers -def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench(dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: if dtype == torch.int8: - return bench_int8(dtype, m, k, n, label, sub_label) + return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) if dtype == torch.float8_e4m3fn: - return bench_fp8(dtype, m, k, n, label, sub_label) + return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels) raise ValueError("unsupported type") @@ -207,18 +193,22 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + MKNs: Iterable[Tuple[int, int, int]], + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: - timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", - f"MKN=({m}x{k}x{n})") + timers = bench(dtype, + m, + k, + n, + f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + bench_kernels=bench_kernels) print_timers(timers) results.extend(timers) - return results -# output makers def make_output(data: Iterable[TMeasurement], MKNs: Iterable[Tuple[int, int, int]], base_description: str, @@ -232,15 +222,11 @@ def make_output(data: Iterable[TMeasurement], pkl.dump(data, f) -# argparse runners - - def run_square_bench(args): dim_sizes = list( range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, MKNs) - + data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"square_bench-{args.dtype}") @@ -251,8 +237,7 @@ def run_range_bench(args): Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes MKNs = list(zip(Ms, Ks, Ns)) - data = run(args.dtype, MKNs) - + data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"range_bench-{args.dtype}") @@ -278,7 +263,7 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: for k, n in KNs: MKNs.append((m, k, n)) - data = run(args.dtype, MKNs) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) model_bench_data.append(data) # Print all results @@ -328,6 +313,15 @@ def to_torch_dtype(dt): type=to_torch_dtype, required=True, help="Available options are ['int8', 'fp8']") + parser.add_argument( + "--kernels", + nargs="+", + type=str, + default=None, + help= + "Exact names of the kernels to benchmark. If not set, runs all kernels." 
+ ) + subparsers = parser.add_subparsers(dest="cmd") square_parser = subparsers.add_parser("square_bench") @@ -362,4 +356,4 @@ def to_torch_dtype(dt): model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() - args.func(args) \ No newline at end of file + args.func(args) diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp index ba9f40a230c8e..ddfaca27147b4 100644 --- a/csrc/core/math.hpp +++ b/csrc/core/math.hpp @@ -1,7 +1,14 @@ +#pragma once + #include #include -inline uint32_t next_pow_2(uint32_t const num) { +inline constexpr uint32_t next_pow_2(uint32_t const num) { if (num <= 1) return num; return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +inline constexpr std::enable_if_t, T> ceil_div(T a, T b) { + return (a + b - 1) / b; } \ No newline at end of file diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 07c9e46c27b06..febc4eccd9561 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -32,3 +32,20 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { } int32_t get_sm_version_num(); + +/** + * A wrapper for a kernel that is used to guard against compilation on + * architectures that will never use the kernel. The purpose of this is to + * reduce the size of the compiled binary. + * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef + * into code that will be executed on the device where it is defined. + */ +template +struct enable_sm90_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; \ No newline at end of file diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp new file mode 100644 index 0000000000000..ec75c29e54f4d --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp @@ -0,0 +1,123 @@ +// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl +// clang-format off +#pragma once + +#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" + +#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA_TMA_WS_SS (BlockScaled Builders) +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + int ScaleGranularityM +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum, + cute::enable_if_t< + not detail::is_use_rmem_A()> +> { + using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum; + + static_assert(is_static::value); + static_assert(is_static::value); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 
Collective Builder\n"); +#endif + static_assert(detail::is_aligned(), + "Should meet TMA alignment requirement\n"); + + static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v); + static constexpr bool IsFP8Input = detail::is_input_fp8(); + static_assert((!IsFP8Input || !IsArrayOfPointersGemm), + "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); + + // For fp32 types, map to tf32 MMA value type + using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; + using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; + + static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); + + static constexpr bool IsCooperative = cute::is_any_of_v>; + using AtomLayoutMNK = cute::conditional_t>, Layout>>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using SmemLayoutAtomA = decltype(detail::ss_smem_selector< + GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(detail::ss_smem_selector< + GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + + using SmemCopyAtomA = void; + using SmemCopyAtomB = void; + + using CollectiveOp = CollectiveMma< + DispatchPolicy, + TileShape_MNK, + ElementA, + TagToStrideA_t, + ElementB, + TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + SmemCopyAtomA, + cute::identity, + GmemTiledCopyB, + SmemLayoutAtomB, + SmemCopyAtomB, + cute::identity + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp new file mode 100644 index 0000000000000..13b90e998625e --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp @@ -0,0 +1,183 @@ +// clang-format off +// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cute/algorithm/clear.hpp" +#include "cute/tensor.hpp" + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////FP8 Accumulation/////////////////////////// +////////////////////////////////////////////////////////////////////////////// +/// This class provides API to promote (add) or scale (multiply_add) the results +/// from the tensor core accumulators to the main accumulators when the number +/// of MMAs reaches the max number of MMA interval specified by user, after that +/// the tensor core accumulators are zeroed. +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +template < + class EngineAccum, + class LayoutAccum> +struct GmmaFP8AccumulationWithScale { + using TensorAccum = cute::Tensor; + using ElementAccumulator = typename EngineAccum::value_type; + + static_assert(is_static::value, "Accumulator Layout should be static"); + static_assert(is_rmem::value , "Accumulator tensor must be rmem resident."); + +private: + TensorAccum& accum_; + TensorAccum accum_temp_; + + uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. + uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop + uint32_t mma_count_; // current executed MMAs + uint32_t reset_accum_flag_; // accum needs to be zeroed or not. + + // promote or `add` the partial accumulators to main accumulator (FADD). + CUTLASS_DEVICE + void promote_core() { + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i); + } + } + + // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). 
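+  // Unlike promote_core(), each partial accumulator element is multiplied by its
+  // per-element block scale before being added into the main accumulator; the
+  // preceding warpgroup_wait<0>() makes the in-flight WGMMA results visible first.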
+ template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_core(const cute::Tensor &scale) { + using TensorScale = cute::Tensor; + + static_assert(is_static::value, "Scale Layout should be static"); + static_assert(is_rmem::value , "Scale tensor must be rmem resident."); + + static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); + + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i) * scale(i); + } + } + +public: + CUTLASS_DEVICE + GmmaFP8AccumulationWithScale( + TensorAccum &accum, + uint32_t accum_promotion_interval, + uint32_t mma_count_per_mainloop_iteration) + : accum_(accum), + accum_promotion_interval_(accum_promotion_interval), + mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), + mma_count_(0), + reset_accum_flag_(0) + { + accum_temp_ = cute::make_fragment_like(accum); + } + + // + // Methods (Common) + // + + CUTLASS_DEVICE + TensorAccum& operator()() { + return accum_temp_; + } + + /// prepare the MMA accumulators when initialization or zeroing is required. + CUTLASS_DEVICE + bool prepare_if_needed() { + return reset_accum_flag_; + } + + // + // Methods (for FADD version) + // + + /// promote (add) the results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_if_needed() { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + promote_core(); + mma_count_ = 0; + } + } + + /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_residue_if_needed() { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + promote_core(); + } + } + + // + // Methods (for FFMA version) + // + + /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_if_needed(const cute::Tensor &scale) { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + scale_core(scale); + mma_count_ = 0; + } + } + + /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_residue_if_needed(const cute::Tensor &scale) { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + scale_core(scale); + } + } +}; + +} // namespace cutlass::gemm::collective diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp new file mode 100644 index 0000000000000..928a9500cbb08 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -0,0 +1,730 @@ +// clang-format off +// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/trace.h" +#include "cutlass/numeric_types.h" + +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/tensor_predicate.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop +template < + int Stages, + class ClusterShape, + class KernelSchedule, + int ScaleGranularityM_, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = 
TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using ElementBlockScale = ElementAccumulator; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); + using MainloopPipeline = cutlass::PipelineTmaAsync; + using PipelineState = cutlass::PipelineState; + using PipelineParams = typename MainloopPipeline::Params; + + // Two threads per CTA are producers (1 for operand tile and 32 for scales) + static constexpr int NumProducerThreadEvents = 33; + + static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; + static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; + + static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); + + // Tile along modes in a way that maximizes the TMA box size. + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + + // Block scaling gmem-to-smem copy atom + using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; + using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; + + // Block scaling smem layout + using SmemLayoutScaleA = Layout, Int>>; + using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
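+  // Shared-memory staging for the block scales: scale A keeps ScaleMsPerTile
+  // values per pipeline stage, while scale B keeps a single value per stage,
+  // since there is only one scale along N for the whole tile.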
+ + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v, + "ElementAccumulator and ElementBlockScale should be same datatype"); + + struct SharedStorage + { + struct TensorStorage : cute::aligned_struct<128> { + cute::array_aligned> smem_A; // mxk + cute::array_aligned> smem_B; // nxk + cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k + cute::array_aligned> smem_scale_B; // 1xk + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // Device side kernel params + struct Params { + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy_A_sm90( + GmemTiledCopyA{}, + make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), + SmemLayoutA{}(_,_,0), + TileShape{}, + ClusterShape{})); + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy_B_sm90( + GmemTiledCopyB{}, + make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), + SmemLayoutB{}(_,_,0), + TileShape{}, + ClusterShape{})); + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; + // Block scaling factors for A and B + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); + typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + uint32_t transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t transaction_bytes_nk = TmaTransactionBytesNK; + uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; + + return { + tma_load_a, + tma_load_b, + transaction_bytes, + transaction_bytes_mk, + transaction_bytes_nk, + args.ptr_scale_A, + args.ptr_scale_B + }; + } + + 
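+  // Note that only A and B receive TMA descriptors here; the block-scale
+  // pointers are carried through Params unchanged and are read with ordinary
+  // cp.async copies inside load().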
template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + constexpr int tma_alignment_bits = 128; + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + bool implementable = true; + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + return implementable; + } + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = 1; + static constexpr uint32_t TmaTransactionBytesMK = + cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytesNK = + cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) + { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) + /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) + template + CUTLASS_DEVICE auto + load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M,N,K,L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Make tiled views, defer the slice + Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + constexpr auto scales_m = Int{}; + auto tM = get<2>(gA_mkl.shape()); + auto tN = get<2>(gB_nkl.shape()); + auto tK = get<3>(gA_mkl.shape()); + + // Make the tiled views of scale tensors + auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) + auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); + auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) + auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); + + // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and + // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. 
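+    // The scale tensors are returned untiled; load() slices them by block
+    // coordinate and by k-tile when issuing the copies.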
+ Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) + Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) + + return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TensorB, + class TensorScaleA, class TensorScaleB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PipelineState smem_pipe_write, + cute::tuple const& load_inputs, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + // Blockscaling: Tma loads for load_input and CpAsync for load_scale + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Prepare the TMA loads for A and B + // + + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + // Partition the inputs based on the current block coordinates. 
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + + // Block scaling: load_scale has scaling tensors in global memory which are not tiled + Tensor mScaleA_mkl = get<2>(load_inputs); + Tensor mScaleB_nkl = get<3>(load_inputs); + auto scales_m = get<0>(mScaleA_mkl.shape()); + + Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); + + Tensor gScaleA = local_tile( + mScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) + Tensor cScaleA = local_tile( + cScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); + Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) + + // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 + TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, + Layout>{}, Layout>{}); // (1,1,1) + TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, + Layout>{}, Layout>{}); // (1,1,1) + ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); + ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); + + Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); + Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); + Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); + + Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); + Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Allocate predicate tensors for a_scales (since we can't guarantee that + // all scales are valid, since we could have a partial tiles along M) + Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); + #pragma unroll + for (int i = 0; i < size(tApA_ScaleA); ++i) { + tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; + } + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + int write_stage = smem_pipe_write.index(); + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + // Copy operands A and B from global memory to shared memory + if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + + 
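+      // The operand tiles above are fetched via TMA from the single elected
+      // lane; the per-block scales below are small, so they are copied with
+      // cp.async by the producer threads instead.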
// Copy scale tensors from global memory to shared memory + copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); + copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); + pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); + + ++k_tile_iter; + + // Advance smem_pipe_write + ++smem_pipe_write; + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail( + MainloopPipeline pipeline, + PipelineState smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_write); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgTensorC + > + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_read, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + + + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // Block scaling + Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), + Layout< + Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, + Stride, _0, Int> + >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Define C accumulators and A/B partitioning + // + + // Layout of warp group to thread mapping + + static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and + stride<0>(typename TiledMma::BLayout{}) == 0 and + size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and + size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, + "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); + + constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(Int{}, + Int{}); + + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); + + Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. 
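+    // The sScaleAViewAsC view broadcasts each scale across its ScaleGranularityM
+    // rows and across the N mode, so partition_C gives every thread the scale
+    // that matches its accumulator rows.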
+ + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_read; + + // Per block scale values for operand A and B + + using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. + using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above + + Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) + ElementBlockScale scale_b; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); + warpgroup_fence_operand(accumulation()); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers. + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + ++smem_pipe_read; + } + + warpgroup_fence_operand(accumulation()); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accumulation()); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + + // Advance smem_pipe_read and smem_pipe_release + ++smem_pipe_read; + ++smem_pipe_release; + } + + accumulation.scale_residue_if_needed(tCrScaleAViewAsC); + + warpgroup_fence_operand(accumulation()); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; + + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp new file mode 100644 index 0000000000000..df809e27a3efe --- /dev/null +++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "cutlass/gemm/dispatch_policy.hpp" + +namespace cutlass::gemm { + +////////////////////////////////////////////////////////////////////////////// + +// FP8 related policies (including Blocked Scaled Accumulation) +// `ScaleGranularityM` specifies scaling granularity along M, while zero-value +// `ScaleGranularityM` indicates that scaling granularity is +// `size<0>(TileShape_MNK{})` along M. +template +struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum + : KernelTmaWarpSpecializedCooperative {}; + +// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp +// specialized dynamic schedule For FP8 kernels with Block Scaling +template , + class KernelSchedule = KernelTmaWarpSpecialized, + int ScaleGranularityM = + 0 // `ScaleGranularityM` specifies scaling granularity along M, + // while zero-value `ScaleGranularityM` indicates that scaling + // granularity is `size<0>(TileShape_MNK{})` along M. 
+ > +struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 + : MainloopSm90TmaGmmaWarpSpecialized { + static_assert( + cute::is_same_v< + KernelSchedule, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + ScaleGranularityM>>, + "KernelSchedule must be one of the warp specialized policies"); +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh index 085ee1290031f..e7fbba4cd4b0d 100644 --- a/csrc/cutlass_extensions/vllm_collective_builder.cuh +++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -1,6 +1,6 @@ #pragma once -#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" namespace cutlass::gemm::collective { using namespace cute; diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh new file mode 100644 index 0000000000000..9ac7eee7204ec --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh @@ -0,0 +1,93 @@ +#pragma once + +// clang-format will break include orders +// clang-format off +#include + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" +// clang-format on + +namespace vllm::c3x { + +static inline cute::Shape get_problem_shape( + torch::Tensor const& a, torch::Tensor const& b) { + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + return {m, n, k, 1}; +} + +template +void cutlass_gemm_caller(torch::Device device, + cute::Shape prob_shape, + typename GemmKernel::MainloopArguments mainloop_args, + typename GemmKernel::EpilogueArguments epilogue_args) { + typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, + prob_shape, mainloop_args, epilogue_args}; + + // Launch the CUTLASS GEMM kernel. + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(device); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(device.index()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + using GemmKernel = typename Gemm::GemmKernel; + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = cute::Stride, int64_t>; + using StrideB = cute::Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, cute::Int<1>{}, 0}; + StrideB b_stride{ldb, cute::Int<1>{}, 0}; + StrideC c_stride{ldc, cute::Int<1>{}, cute::Int<0>{}}; + + typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; + + cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +} // namespace vllm::c3x \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh similarity index 51% rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh index d4bc2f0ade50d..9227ebb735245 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -2,9 +2,6 @@ // clang-format will break include orders // clang-format off -#include - -#include #include "cutlass/cutlass.h" @@ -32,21 +29,6 @@ using namespace cute; namespace vllm { -// A wrapper for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm90_or_later : Kernel { - template - CUTLASS_DEVICE void operator()(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 - Kernel::operator()(std::forward(args)...); -#endif - } -}; - template typename Epilogue_, typename TileShape, typename ClusterShape, typename KernelSchedule, @@ -101,60 +83,4 @@ struct cutlass_3x_gemm { struct GemmKernel : public KernelType {}; }; -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... 
epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideA = Stride, int64_t>; - using StrideB = Stride, int64_t>; - using StrideC = typename Gemm::StrideC; - - StrideA a_stride{lda, Int<1>{}, 0}; - StrideB b_stride{ldb, Int<1>{}, 0}; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - using GemmKernel = typename Gemm::GemmKernel; - typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, - b_stride}; - - auto c_ptr = static_cast(out.data_ptr()); - typename GemmKernel::EpilogueArguments epilogue_args{ - Gemm::Epilogue::prepare_args( - std::forward(epilogue_params)...), - c_ptr, c_stride, c_ptr, c_stride}; - - typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, - prob_shape, mainloop_args, epilogue_args}; - - // Launch the CUTLASS GEMM kernel. - using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; - GemmOp gemm_op; - CUTLASS_CHECK(gemm_op.can_implement(args)); - - size_t workspace_size = gemm_op.get_workspace_size(args); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); - CUTLASS_CHECK(status); -} - } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu new file mode 100644 index 0000000000000..4cd38f4975df7 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm90_int8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias) { + if (azp) { + return cutlass_scaled_mm_sm90_int8_epilogue< + c3x::ScaledEpilogueBiasAzpToken>(out, a, b, a_scales, b_scales, azp_adj, + *azp, bias); + } else { + return cutlass_scaled_mm_sm90_int8_epilogue( + out, a, b, a_scales, b_scales, azp_adj, bias); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu new file mode 100644 index 0000000000000..0501e6da160e2 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu @@ -0,0 +1,24 @@ + +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + 
TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh new file mode 100644 index 0000000000000..fb7a82b80ee65 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -0,0 +1,168 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +template > +struct cutlass_3x_gemm_fp8_blockwise { + using GroupSizeM = Int; + using GroupSizeN = Int; + using GroupSizeK = Int; + using TileSizeM = Int; + + static_assert(TileSizeM_ % GroupSizeM_ == 0, + "TileSizeM must be a multiple of GroupSizeM"); + + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using StrideD = Stride, Int<0>>; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using StrideC = StrideD; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementBlockScale = float; + using ElementCompute = float; + using ArchTag = cutlass::arch::Sm90; + using OperatorClass = cutlass::arch::OpClassTensorOp; + using TileShape = Shape; + + using KernelSchedule = cutlass::gemm:: + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + GroupSizeM_>; + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; + using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + + using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< + cutlass::epilogue::fusion::Sm90AccFetch>; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, + ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, + ElementD, StrideD, AlignmentD, EpilogueSchedule, + StoreEpilogueCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, + LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + cutlass::gemm::PersistentScheduler>>; + + struct GemmKernel 
: public KernelType {}; + + using StrideA = typename GemmKernel::StrideA; + using StrideB = typename GemmKernel::StrideB; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using GemmKernel = typename Gemm::GemmKernel; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + auto prob_shape = c3x::get_problem_shape(a, b); + int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), + k = get<2>(prob_shape); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + // Check is the t is contiguous and is 1D or 2D with one of the dimensions + // being 1 (i.e. a row or column vector) + auto is_contiguous_vector = [](const torch::Tensor& t) { + auto t_sizes = t.sizes(); + return t.is_contiguous() && + (t.dim() == 1 || + (t.dim() == 2 && + *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); + }; + + // TODO(lucas): lets clean-up the kernel so that we pass in Strides so + // we don't have to deal with enforcing implicit layouts + TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); + TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), + "a_scales must be M major"); + TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); + TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), + "b_scales must be K major"); + typename GemmKernel::MainloopArguments mainloop_args{ + a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +template +void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + cutlass_gemm_caller_blockwise< + cutlass_3x_gemm_fp8_blockwise>(out, a, b, a_scales, + b_scales); +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp new file mode 100644 index 0000000000000..7ede9e067477b --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include + +namespace vllm { + +void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, 
torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias); + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu new file mode 100644 index 0000000000000..e092c61abc249 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm90_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias) { + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm90_fp8_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm90_fp8_epilogue( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh similarity index 76% rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh index f08419b3122b2..32ea5db3321bc 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh @@ -1,6 +1,7 @@ #pragma once -#include "scaled_mm_c3x.cuh" +#include "scaled_mm.cuh" +#include "cutlass_gemm_caller.cuh" /** * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm @@ -9,6 +10,8 @@ namespace vllm { +using c3x::cutlass_gemm_caller; + template typename Epilogue> struct sm90_fp8_config_default { @@ -93,4 +96,25 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, } } +template